library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## v purrr 0.3.4
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(skimr)
## Warning: package 'skimr' was built under R version 4.0.5
library(purrr)
library(tidyr)
library(tidyverse)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Pick the machine-specific location of the moneyball CSV files, then read
# the training and evaluation sets.
location <- "home"
at_work <- location == "work"
train_csv <- if (at_work) {
  "C:\\Users\\eric.hirsch\\Desktop\\Rstudio\\CUNY_621\\Baseball\\moneyball-training-data.csv"
} else {
  "D:\\RStudio\\CUNY_621\\Baseball\\moneyball-training-data.csv"
}
eval_csv <- if (at_work) {
  "C:\\Users\\eric.hirsch\\Desktop\\RStudio\\CUNY_621\\Baseball\\moneyball-evaluation-data.csv"
} else {
  "D:\\RStudio\\CUNY_621\\Baseball\\moneyball-evaluation-data.csv"
}
dfTrain <- read.csv(train_csv, header = TRUE)
dfEval <- read.csv(eval_csv, header = TRUE)
# Strip the "TEAM_" prefix from the training column names. The pattern is
# anchored so that only a leading prefix is removed, never a "TEAM_" that
# happens to occur inside a name.
names(dfTrain) <- sub("^TEAM_", "", names(dfTrain))
# Working copy of the training data.
dfTrain2 <- dfTrain
summary(dfTrain)
## INDEX TARGET_WINS BATTING_H BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## BATTING_3B BATTING_HR BATTING_BB BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## BASERUN_SB BASERUN_CS BATTING_HBP PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
summary(dfEval)
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## Min. : 9 Min. : 819 Min. : 44.0 Min. : 14.00
## 1st Qu.: 708 1st Qu.:1387 1st Qu.:210.0 1st Qu.: 35.00
## Median :1249 Median :1455 Median :239.0 Median : 52.00
## Mean :1264 Mean :1469 Mean :241.3 Mean : 55.91
## 3rd Qu.:1832 3rd Qu.:1548 3rd Qu.:278.5 3rd Qu.: 72.00
## Max. :2525 Max. :2170 Max. :376.0 Max. :155.00
##
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## Min. : 0.00 Min. : 15.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 44.50 1st Qu.:436.5 1st Qu.: 545.0 1st Qu.: 59.0
## Median :101.00 Median :509.0 Median : 686.0 Median : 92.0
## Mean : 95.63 Mean :499.0 Mean : 709.3 Mean :123.7
## 3rd Qu.:135.50 3rd Qu.:565.5 3rd Qu.: 912.0 3rd Qu.:151.8
## Max. :242.00 Max. :792.0 Max. :1268.0 Max. :580.0
## NA's :18 NA's :13
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 0.00 Min. :42.00 Min. : 1155 Min. : 0.0
## 1st Qu.: 38.00 1st Qu.:53.50 1st Qu.: 1426 1st Qu.: 52.0
## Median : 49.50 Median :62.00 Median : 1515 Median :104.0
## Mean : 52.32 Mean :62.37 Mean : 1813 Mean :102.1
## 3rd Qu.: 63.00 3rd Qu.:67.50 3rd Qu.: 1681 3rd Qu.:142.5
## Max. :154.00 Max. :96.00 Max. :22768 Max. :336.0
## NA's :87 NA's :240
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 136.0 Min. : 0.0 Min. : 73.0 Min. : 69.0
## 1st Qu.: 471.0 1st Qu.: 613.0 1st Qu.: 131.0 1st Qu.:131.0
## Median : 526.0 Median : 745.0 Median : 163.0 Median :148.0
## Mean : 552.4 Mean : 799.7 Mean : 249.7 Mean :146.1
## 3rd Qu.: 606.5 3rd Qu.: 938.0 3rd Qu.: 252.0 3rd Qu.:164.0
## Max. :2008.0 Max. :9963.0 Max. :1568.0 Max. :204.0
## NA's :18 NA's :31
We begin with an initial exploration of the dataset.
dim(dfTrain)
## [1] 2276 17
summary(dfTrain)
## INDEX TARGET_WINS BATTING_H BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## BATTING_3B BATTING_HR BATTING_BB BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## BASERUN_SB BASERUN_CS BATTING_HBP PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
skim(dfTrain)
| Name | dfTrain |
| Number of rows | 2276 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| numeric | 17 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| INDEX | 0 | 1.00 | 1268.46 | 736.35 | 1 | 630.75 | 1270.5 | 1915.50 | 2535 | ▇▇▇▇▇ |
| TARGET_WINS | 0 | 1.00 | 80.79 | 15.75 | 0 | 71.00 | 82.0 | 92.00 | 146 | ▁▁▇▅▁ |
| BATTING_H | 0 | 1.00 | 1469.27 | 144.59 | 891 | 1383.00 | 1454.0 | 1537.25 | 2554 | ▁▇▂▁▁ |
| BATTING_2B | 0 | 1.00 | 241.25 | 46.80 | 69 | 208.00 | 238.0 | 273.00 | 458 | ▁▆▇▂▁ |
| BATTING_3B | 0 | 1.00 | 55.25 | 27.94 | 0 | 34.00 | 47.0 | 72.00 | 223 | ▇▇▂▁▁ |
| BATTING_HR | 0 | 1.00 | 99.61 | 60.55 | 0 | 42.00 | 102.0 | 147.00 | 264 | ▇▆▇▅▁ |
| BATTING_BB | 0 | 1.00 | 501.56 | 122.67 | 0 | 451.00 | 512.0 | 580.00 | 878 | ▁▁▇▇▁ |
| BATTING_SO | 102 | 0.96 | 735.61 | 248.53 | 0 | 548.00 | 750.0 | 930.00 | 1399 | ▁▆▇▇▁ |
| BASERUN_SB | 131 | 0.94 | 124.76 | 87.79 | 0 | 66.00 | 101.0 | 156.00 | 697 | ▇▃▁▁▁ |
| BASERUN_CS | 772 | 0.66 | 52.80 | 22.96 | 0 | 38.00 | 49.0 | 62.00 | 201 | ▃▇▁▁▁ |
| BATTING_HBP | 2085 | 0.08 | 59.36 | 12.97 | 29 | 50.50 | 58.0 | 67.00 | 95 | ▂▇▇▅▁ |
| PITCHING_H | 0 | 1.00 | 1779.21 | 1406.84 | 1137 | 1419.00 | 1518.0 | 1682.50 | 30132 | ▇▁▁▁▁ |
| PITCHING_HR | 0 | 1.00 | 105.70 | 61.30 | 0 | 50.00 | 107.0 | 150.00 | 343 | ▇▇▆▁▁ |
| PITCHING_BB | 0 | 1.00 | 553.01 | 166.36 | 0 | 476.00 | 536.5 | 611.00 | 3645 | ▇▁▁▁▁ |
| PITCHING_SO | 102 | 0.96 | 817.73 | 553.09 | 0 | 615.00 | 813.5 | 968.00 | 19278 | ▇▁▁▁▁ |
| FIELDING_E | 0 | 1.00 | 246.48 | 227.77 | 65 | 127.00 | 159.0 | 249.25 | 1898 | ▇▁▁▁▁ |
| FIELDING_DP | 286 | 0.87 | 146.39 | 26.23 | 52 | 131.00 | 149.0 | 164.00 | 228 | ▁▂▇▆▁ |
str(dfTrain)
## 'data.frame': 2276 obs. of 17 variables:
## $ INDEX : int 1 2 3 4 5 6 7 8 11 12 ...
## $ TARGET_WINS: int 39 70 86 70 82 75 80 85 86 76 ...
## $ BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ BASERUN_SB : int NA 37 46 43 49 107 80 40 69 72 ...
## $ BASERUN_CS : int NA 28 27 30 39 59 54 36 27 34 ...
## $ BATTING_HBP: int NA NA NA NA NA NA NA NA NA NA ...
## $ PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ FIELDING_DP: int NA 155 153 156 168 149 186 136 169 159 ...
BATTING_HBP has too many missing values (2,085 of 2,276 records), so we remove it:
# Remove the hit-by-pitch column from the working copy.
dfTrain2 <- dplyr::select(dfTrain2, -BATTING_HBP)
Before we impute values for the NAs, we need to check that the records with NAs do not form some kind of group. The fact that several columns have the same number of missing values suggests that they might. So first we look to see whether the missing values are correlated:
# Flag the records whose batting strikeouts are missing (1 = missing).
dfTrain2 <- dplyr::mutate(
  dfTrain2,
  Missing_Flag = as.numeric(is.na(BATTING_SO))
)
# Among the records that do have BATTING_SO, summarise the columns of
# interest to see how many of their values are still missing.
dfTrain3 <- dplyr::select(
  dplyr::filter(dfTrain2, Missing_Flag == 0),
  BATTING_SO, PITCHING_SO, BASERUN_CS, BASERUN_SB
)
summary(dfTrain3)
## BATTING_SO PITCHING_SO BASERUN_CS BASERUN_SB
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 548.0 1st Qu.: 615.0 1st Qu.: 38.0 1st Qu.: 65.0
## Median : 750.0 Median : 813.5 Median : 49.0 Median : 98.0
## Mean : 735.6 Mean : 817.7 Mean : 52.8 Mean :120.8
## 3rd Qu.: 930.0 3rd Qu.: 968.0 3rd Qu.: 62.0 3rd Qu.:147.0
## Max. :1399.0 Max. :19278.0 Max. :201.0 Max. :697.0
## NA's :670 NA's :131
There is some cohort effect: the records missing PITCHING_SO are exactly the records missing BATTING_SO, and there is some overlap with the records missing BASERUN_CS. Now let's impute the mean/median and see how well the new model performs versus the old:
# Replace every NA in each column of `df` with `stat()` of that column's
# observed values. Working column by column always returns a data frame of the
# same shape, whereas sapply() collapses to a vector for 0- or 1-row input.
impute_columns <- function(df, stat) {
  df[] <- lapply(df, function(x) {
    x[is.na(x)] <- stat(x, na.rm = TRUE)
    x
  })
  df
}

dfTrain_ImputedMedian <- impute_columns(dfTrain2, median)
dfTrain_ImputedMean <- impute_columns(dfTrain2, mean)

# "NoCohort" variants: drop the records flagged as missing BATTING_SO, then
# drop the flag itself.
dfTrain_ImputedMean_NoCohort <- dfTrain_ImputedMean %>%
  dplyr::filter(Missing_Flag == 0) %>%
  dplyr::select(-Missing_Flag)
dfTrain_ImputedMedian_NoCohort <- dfTrain_ImputedMedian %>%
  dplyr::filter(Missing_Flag == 0) %>%
  dplyr::select(-Missing_Flag)

# NOTE(review): `TARGET_WINS ~ .` uses every remaining column as a predictor,
# which includes INDEX and, for m1/m3, Missing_Flag -- confirm this is intended.
m1 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
m2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian_NoCohort)
m3 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMean)
m4 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMean_NoCohort)
summary(m1)$adj.r.squared
## [1] 0.313437
summary(m2)$adj.r.squared
## [1] 0.3147084
summary(m3)$adj.r.squared
## [1] 0.3169625
summary(m4)$adj.r.squared
## [1] 0.3178529
There appears to be a minor effect. Imputing the mean into the other columns with NAs and removing the cohort records has a very small positive effect on the model.
Now we can look at interactions between the “cohort” and other variables:
# Build one scatterplot per numeric column of `df`, plotting that column
# against `y` with points and per-group linear fits coloured by `interaction`.
#
# df          : data frame; only its numeric columns are used.
# y           : name (string) of the column drawn on the y axis.
# interaction : name (string) of a numeric column that is converted to a
#               factor and mapped to colour.
#
# Returns an unnamed list of ggplot objects, one per numeric column in column
# order (the `y` and `interaction` columns get a plot as well), or an empty
# list when `df` has no numeric columns.
EHExplore_Interactions_Scatterplots <- function(df, y, interaction) {
  # Kept from the original: ggsci provides the D3 colour scales and is not
  # attached at the top of the script.
  library(ggsci)

  df <- dplyr::select(df, where(is.numeric))
  if (ncol(df) == 0) {
    return(list())
  }
  if (!all(c(y, interaction) %in% names(df))) {
    stop("`y` and `interaction` must name numeric columns of `df`.",
         call. = FALSE)
  }
  df[[interaction]] <- as.factor(df[[interaction]])

  # The theme does not depend on the column, so it is built once.
  panel_theme <- theme(
    title = element_text(size = 7),
    axis.title.x = element_text(size = 7),
    axis.title.y = element_text(size = 9),
    axis.text.x = element_text(size = 8),
    panel.grid.major.x = element_line(color = "gray"),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_line(color = "gray"),
    panel.background = element_rect(fill = "slategray1", color = "darkslategray")
  )

  build_plot <- function(col) {
    # `!!` injects the column names as literal strings, so each plot is tied
    # to its own column and cannot be confused with a column of `df` that
    # happens to be called "col", "y" or "interaction".
    ggplot(
      df,
      aes(
        x = .data[[!!col]],
        y = .data[[!!y]],
        color = .data[[!!interaction]]
      )
    ) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE) +
      labs(x = "", y = y, color = interaction, title = col) +
      panel_theme +
      scale_color_d3() +
      scale_fill_d3()
  }

  lapply(names(df), build_plot)
}
library(patchwork)
dfTmp <- dfTrain_ImputedMean %>%
mutate(Pitch_h_Under1500 = as.factor(ifelse(PITCHING_H<=1500, 1, 0)))
z1 <- EHExplore_Interactions_Scatterplots(dfTrain_ImputedMean, "TARGET_WINS", "Missing_Flag")
grid.arrange(grobs=z1[c(2:7)], ncol=2, nrow=3)
grid.arrange(grobs=z1[c(8:13)], ncol=2, nrow=3)
grid.arrange(grobs=z1[c(14:16)], ncol=2, nrow=3)
The interaction analysis suggests that the cohort is not random - there are numerous interactions with many other variables, some of which are quite counterintuitive (team pitching H). So we could either model the cohort (random effects, a flag, or interaction terms) or toss these records. Because bad data is not reproducible I will toss them, at the expense of better predictions if I can identify the cohort in the eval data.
Now we can look at the stats in our new dataset.
summary(dfTrain_ImputedMean_NoCohort)
## INDEX TARGET_WINS BATTING_H BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 640.2 1st Qu.: 71.00 1st Qu.:1389 1st Qu.:211.2
## Median :1275.5 Median : 82.00 Median :1458 Median :240.0
## Mean :1275.2 Mean : 80.76 Mean :1475 Mean :243.9
## 3rd Qu.:1923.8 3rd Qu.: 91.00 3rd Qu.:1541 3rd Qu.:275.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
## BATTING_3B BATTING_HR BATTING_BB BATTING_SO
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 48.0 1st Qu.:456.0 1st Qu.: 548.0
## Median : 46.00 Median :107.0 Median :517.0 Median : 750.0
## Mean : 54.45 Mean :103.4 Mean :505.1 Mean : 735.6
## 3rd Qu.: 71.00 3rd Qu.:148.0 3rd Qu.:582.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.0 Max. :878.0 Max. :1399.0
## BASERUN_SB BASERUN_CS PITCHING_H PITCHING_HR
## Min. : 0.0 Min. : 0.0 Min. : 1137 Min. : 0.0
## 1st Qu.: 66.0 1st Qu.: 44.0 1st Qu.: 1425 1st Qu.: 58.0
## Median :102.0 Median : 52.8 Median : 1521 Median :111.0
## Mean :121.1 Mean : 52.8 Mean : 1794 Mean :109.7
## 3rd Qu.:143.8 3rd Qu.: 55.0 3rd Qu.: 1694 3rd Qu.:152.8
## Max. :697.0 Max. :201.0 Max. :30132 Max. :343.0
## PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
## Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
## 1st Qu.: 479.2 1st Qu.: 615.0 1st Qu.: 126.0 1st Qu.:137.0
## Median : 542.0 Median : 813.5 Median : 155.0 Median :146.4
## Mean : 557.5 Mean : 817.7 Mean : 243.9 Mean :148.6
## 3rd Qu.: 614.8 3rd Qu.: 968.0 3rd Qu.: 234.0 3rd Qu.:162.0
## Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
# Build one boxplot per numeric column of `df`. The axis opposite the variable
# is labelled with the five lowest ("L:") and five highest ("H:") values of
# the column, rounded to two decimals.
#
# df   : data frame; only its numeric columns are plotted.
# size : "large" uses font size 10 for that value label, anything else 7.
#
# Returns an unnamed list of ggplot objects in column order (empty when `df`
# has no numeric columns).
EHExplore_Outliers_Boxplots <- function(df, size = "small") {
  df <- dplyr::select(df, where(is.numeric))
  s <- if (size == "large") 10 else 7

  # The theme does not depend on the column, so it is built once.
  box_theme <- theme(
    axis.title.x = element_text(size = s),
    axis.title.y = element_text(size = 9),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_line(color = "gray"),
    panel.background = element_rect(fill = "slategray2", color = "darkslategray")
  )

  build_plot <- function(col) {
    ordered_values <- sort(round(df[[col]], 2))
    # A plain "\n" separates the two lines; the original "\\\n" also printed
    # a literal backslash at the end of the first line.
    qk <- str_c(
      "L: ", toString(head(ordered_values, 5)), "\n",
      "H: ", toString(tail(ordered_values, 5))
    )
    # `!!` injects the column name so every plot refers to its own column.
    ggplot(df, aes(x = .data[[!!col]])) +
      coord_flip() +
      xlab(col) +
      ylab(qk) +
      box_theme +
      geom_boxplot()
  }

  lapply(names(df), build_plot)
}
z <- EHExplore_Outliers_Boxplots(dfTrain_ImputedMean_NoCohort, "small")
wrap_plots(z)
# Build one density-scaled histogram per numeric column of `df`, overlaid with
# a red density curve. The x axis is labelled with the five lowest ("L:") and
# five highest ("H:") values of the column, rounded to two decimals.
#
# df    : data frame; only its numeric columns are plotted.
# size  : "large" uses font size 10 for the x-axis label, anything else 7.
# nbins : number of histogram bins.
#
# Returns an unnamed list of ggplot objects in column order (empty when `df`
# has no numeric columns).
EHExplore_Distributions_Histograms <- function(df, size = "small", nbins = 100) {
  df <- dplyr::select(df, where(is.numeric))
  s <- if (size == "large") 10 else 7

  # The theme does not depend on the column, so it is built once.
  hist_theme <- theme(
    axis.title.x = element_text(size = s),
    axis.title.y = element_text(size = 9),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(size = 8),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.background = element_rect(fill = "slategray2", color = "darkslategray")
  )

  build_plot <- function(col) {
    ordered_values <- sort(round(df[[col]], 2))
    # A plain "\n" separates the two lines; the original "\\\n" also printed
    # a literal backslash at the end of the first line.
    qk <- str_c(
      "L: ", toString(head(ordered_values, 5)), "\n",
      "H: ", toString(tail(ordered_values, 5))
    )
    # `!!` injects the column name so every plot refers to its own column.
    ggplot(df, aes(x = .data[[!!col]])) +
      ylab(col) +
      xlab(qk) +
      hist_theme +
      # after_stat() replaces the deprecated stat() helper.
      geom_histogram(aes(y = after_stat(density)), bins = nbins, fill = "white") +
      geom_density(colour = "red")
  }

  lapply(names(df), build_plot)
}
z6 <- EHExplore_Distributions_Histograms(dfTrain_ImputedMean_NoCohort, "small")
wrap_plots(z6)
# Interleave two lists element by element: the result holds list1[1],
# list2[1], list1[2], list2[2], ... and has twice the length of `list1`.
# Positions for which `list2` has no element are left as NULL.
# Empty input yields an empty list (the original `1:length(list1)` loop
# failed in that case).
EHExplore_CombineGraphs_2 <- function(list1, list2) {
  n <- length(list1)
  zz7 <- vector("list", 2L * n)
  for (i in seq_len(n)) {
    zz7[2L * i - 1L] <- list1[i]
    zz7[2L * i] <- list2[i]
  }
  zz7
}
#zz1 <- EHExplore_CombineGraphs(z, z4, z6)
#grid.arrange(grobs=zz1[c(1:16)], ncol=4, nrow=4)
#grid.arrange(grobs=zz1[c(17:32)], ncol=4, nrow=4)
There are 4 categories where 0s may be NAs: pitching and batting HR, and pitching and batting SO. We look more closely at these categories:
# Pull out the four strikeout and home-run columns and plot their
# distributions.
dfTrain_ZeroAsNA <- dplyr::select(
  dfTrain,
  PITCHING_SO, PITCHING_HR, BATTING_SO, BATTING_HR
)
hist(dfTrain_ZeroAsNA)
We can check to see if the zeroes behave like NAs or like actual values. We compare the interaction with PITCHING_H in both cases. They behave very differently, and neither behaves like the overall sample:
# Flag the records whose pitching strikeouts are zero (or below) and draw the
# wins scatterplots split by that flag.
dfTmp <- dplyr::mutate(
  dfTrain_ImputedMean,
  Zeros = as.numeric(PITCHING_SO <= 0)
)
z2 <- EHExplore_Interactions_Scatterplots(dfTmp, "TARGET_WINS", "Zeros")
# Element 11 of each list is the plot for the 11th numeric column; show the
# "Zeros" version next to the "Missing_Flag" version.
grid.arrange(z2[[11]], z1[[11]], ncol = 2, nrow = 3)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Looking for other groups: it is hard to say - there seems to be something about lower SO values being more negatively correlated with wins than the others - but the ns may be small:
# Flag the records with between 0 and 400 pitching strikeouts (inclusive).
dfTmp <- dplyr::mutate(
  dfTrain_ImputedMean,
  Zeros = as.numeric(PITCHING_SO >= 0 & PITCHING_SO <= 400)
)
# The flagged records on their own.
dfx <- dplyr::filter(dfTmp, Zeros == 1)
z2 <- EHExplore_Interactions_Scatterplots(dfTmp, "TARGET_WINS", "Zeros")
# Element 11 of each list is the plot for the 11th numeric column.
grid.arrange(z2[[11]], z1[[11]], ncol = 2, nrow = 3)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
We will do nothing with outliers or with zeros that may be NAs for now.
# hist() controls the number of cells with `breaks`; `bins` is not one of its
# arguments and was only forwarded to the plotting calls, producing warnings.
hist(dfTrain$TARGET_WINS, breaks = 20)
## Warning in plot.window(xlim, ylim, "", ...): "bins" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "bins"
## is not a graphical parameter
## Warning in axis(1, ...): "bins" is not a graphical parameter
## Warning in axis(2, ...): "bins" is not a graphical parameter
head(sort(dfTrain$TARGET_WINS))
## [1] 0 12 14 17 21 22
# Look at the record(s) with zero wins.
dfTrain_ZeroWins <- dplyr::filter(dfTrain, TARGET_WINS == 0)
head(dfTrain_ZeroWins, 1)
## INDEX TARGET_WINS BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB
## 1 1347 0 891 135 0 0 0
## BATTING_SO BASERUN_SB BASERUN_CS BATTING_HBP PITCHING_H PITCHING_HR
## 1 0 0 0 NA 24057 0
## PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
## 1 0 0 1890 NA
Target_Wins appears normally distributed - the zero is suspicious but I’m going to leave it.
dfCor <- as.data.frame(cor(dfTrain_ImputedMean_NoCohort))
dfCor
## INDEX TARGET_WINS BATTING_H BATTING_2B BATTING_3B
## INDEX 1.0000000000 -0.02928140 -0.03131390 -0.003976934 -0.00497585
## TARGET_WINS -0.0292813985 1.00000000 0.39476995 0.293205037 0.13685882
## BATTING_H -0.0313139014 0.39476995 1.00000000 0.540648272 0.45802046
## BATTING_2B -0.0039769341 0.29320504 0.54064827 1.000000000 -0.08532550
## BATTING_3B -0.0049758496 0.13685882 0.45802046 -0.085325497 1.00000000
## BATTING_HR 0.0413809930 0.19059035 -0.06194956 0.393641975 -0.63765753
## BATTING_BB -0.0358540809 0.23250609 -0.10545406 0.230196649 -0.28160593
## BATTING_SO 0.0814501106 -0.03175071 -0.46385357 0.162685188 -0.66978119
## BASERUN_SB 0.0435154747 0.11143414 0.14886129 -0.153728585 0.49301668
## BASERUN_CS 0.0004632733 0.01610843 0.01198251 -0.077632602 0.19833581
## PITCHING_H 0.0146890757 -0.11576530 0.29979491 0.008872511 0.20396690
## PITCHING_HR 0.0403725584 0.20531868 0.02082589 0.412455481 -0.56629509
## PITCHING_BB -0.0233549401 0.12063924 0.07067846 0.149565361 0.01294580
## PITCHING_SO 0.0558901457 -0.07843609 -0.25265679 0.064792315 -0.25881893
## FIELDING_E -0.0068738726 -0.17639551 0.28252119 -0.232247607 0.51354615
## FIELDING_DP 0.0061318975 -0.02860414 0.04535652 0.178563220 -0.21908499
## BATTING_HR BATTING_BB BATTING_SO BASERUN_SB BASERUN_CS
## INDEX 0.04138099 -0.03585408 0.08145011 0.04351547 0.0004632733
## TARGET_WINS 0.19059035 0.23250609 -0.03175071 0.11143414 0.0161084320
## BATTING_H -0.06194956 -0.10545406 -0.46385357 0.14886129 0.0119825143
## BATTING_2B 0.39364197 0.23019665 0.16268519 -0.15372858 -0.0776326024
## BATTING_3B -0.63765753 -0.28160593 -0.66978119 0.49301668 0.1983358054
## BATTING_HR 1.00000000 0.50439692 0.72706935 -0.39942181 -0.3034743273
## BATTING_BB 0.50439692 1.00000000 0.37975087 -0.06545891 -0.0861202523
## BATTING_SO 0.72706935 0.37975087 1.00000000 -0.23837153 -0.1566149092
## BASERUN_SB -0.39942181 -0.06545891 -0.23837153 1.00000000 0.2869124889
## BASERUN_CS -0.30347433 -0.08612025 -0.15661491 0.28691249 1.0000000000
## PITCHING_H -0.27656010 -0.46585690 -0.37568637 0.07198568 -0.0369545996
## PITCHING_HR 0.96659392 0.44681242 0.66717889 -0.36564098 -0.3034478040
## PITCHING_BB 0.10677385 0.47385394 0.03700514 0.14323815 -0.0542531880
## PITCHING_SO 0.18470756 -0.02075682 0.41623330 -0.05615058 -0.0686217842
## FIELDING_E -0.59891151 -0.66138116 -0.58466444 0.36999309 0.0236201201
## FIELDING_DP 0.33368751 0.32158157 0.14599850 -0.24957358 -0.1563091914
## PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E
## INDEX 0.014689076 0.04037256 -0.02335494 0.05589015 -0.006873873
## TARGET_WINS -0.115765302 0.20531868 0.12063924 -0.07843609 -0.176395507
## BATTING_H 0.299794910 0.02082589 0.07067846 -0.25265679 0.282521195
## BATTING_2B 0.008872511 0.41245548 0.14956536 0.06479231 -0.232247607
## BATTING_3B 0.203966905 -0.56629509 0.01294580 -0.25881893 0.513546149
## BATTING_HR -0.276560100 0.96659392 0.10677385 0.18470756 -0.598911507
## BATTING_BB -0.465856896 0.44681242 0.47385394 -0.02075682 -0.661381160
## BATTING_SO -0.375686369 0.66717889 0.03700514 0.41623330 -0.584664436
## BASERUN_SB 0.071985680 -0.36564098 0.14323815 -0.05615058 0.369993094
## BASERUN_CS -0.036954600 -0.30344780 -0.05425319 -0.06862178 0.023620120
## PITCHING_H 1.000000000 -0.16448724 0.31845282 0.26724807 0.672838853
## PITCHING_HR -0.164487236 1.00000000 0.19575531 0.20588053 -0.501758136
## PITCHING_BB 0.318452818 0.19575531 1.00000000 0.48849865 -0.016375919
## PITCHING_SO 0.267248074 0.20588053 0.48849865 1.00000000 -0.023291783
## FIELDING_E 0.672838853 -0.50175814 -0.01637592 -0.02329178 1.000000000
## FIELDING_DP -0.088957308 0.32336753 0.15211734 0.01039232 -0.257897297
## FIELDING_DP
## INDEX 0.006131897
## TARGET_WINS -0.028604138
## BATTING_H 0.045356517
## BATTING_2B 0.178563220
## BATTING_3B -0.219084985
## BATTING_HR 0.333687510
## BATTING_BB 0.321581568
## BATTING_SO 0.145998500
## BASERUN_SB -0.249573580
## BASERUN_CS -0.156309191
## PITCHING_H -0.088957308
## PITCHING_HR 0.323367525
## PITCHING_BB 0.152117341
## PITCHING_SO 0.010392318
## FIELDING_E -0.257897297
## FIELDING_DP 1.000000000
# Show the correlations as they are: by default heatmap() centres and scales
# every row, which would distort the colours of a correlation matrix.
heatmap(as.matrix(dfCor), Rowv = NA, Colv = NA, scale = "none")
Investigate the suspicious HR category:
cor.test(dfTrain$PITCHING_HR, dfTrain$TARGET_WINS)
##
## Pearson's product-moment correlation
##
## data: dfTrain$PITCHING_HR and dfTrain$TARGET_WINS
## t = 9.1789, df = 2274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1490846 0.2283275
## sample estimates:
## cor
## 0.1890137
ggplot(dfTrain, aes(PITCHING_HR, BATTING_HR, color=INDEX)) +
geom_point()
hist(dfTrain$PITCHING_HR, breaks=100)
plot(dfTrain$PITCHING_HR, dfTrain$TARGET_WINS)
# Print the summary of a fitted model and draw its diagnostic plots on a
# 2 x 2 grid.
#
# model : a fitted model object that has summary() and plot() methods.
#
# Called for its side effects (console output and plots); returns NULL
# invisibly. The graphical parameters in effect before the call are restored
# on exit, so the 2 x 2 layout no longer leaks into later plots.
EHModel_PrintSummary <- function(model) {
  print(summary(model))
  old_par <- par(mfcol = c(2, 2))
  on.exit(par(old_par), add = TRUE)
  # plot() is called directly: wrapping it in print() only printed "NULL".
  plot(model)
  invisible(NULL)
}
m1 <- lm(TARGET_WINS ~ PITCHING_HR, data=dfTrain)
summary(m1)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.657 -9.956 0.636 10.055 67.477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.656920 0.646540 117.018 <2e-16 ***
## PITCHING_HR 0.048572 0.005292 9.179 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2274 degrees of freedom
## Multiple R-squared: 0.03573, Adjusted R-squared: 0.0353
## F-statistic: 84.25 on 1 and 2274 DF, p-value: < 2.2e-16
plot(m1)
library(car)
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:psych':
##
## logit
# `id.method` is not an argument of this version of influencePlot(); it was
# passed on to the graphics calls and only produced warnings. Point labelling
# is controlled by `id`.
influencePlot(m1, id = TRUE, main = "Influence Plot", sub = "Circle size is proportional to Cook’s distance")
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter
## StudRes Hat CookD
## 299 4.380293 0.0006944747 0.0066141630
## 832 0.173993 0.0070267976 0.0001071615
## 964 -1.050146 0.0058117315 0.0032231964
## 1211 -4.919225 0.0017463018 0.0209523937
## 2233 -4.132563 0.0017463018 0.0148329515
# Row positions of the observations removed before refitting.
rows_to_drop <- c(1211, 2233, 299, 1825, 832)
dfTrain2 <- dfTrain[-rows_to_drop, ]
cor.test(dfTrain2$PITCHING_HR, dfTrain2$TARGET_WINS)
##
## Pearson's product-moment correlation
##
## data: dfTrain2$PITCHING_HR and dfTrain2$TARGET_WINS
## t = 8.8525, df = 2269, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1426547 0.2221771
## sample estimates:
## cor
## 0.1827147
m2 <- lm(TARGET_WINS ~ PITCHING_HR, data=dfTrain2)
summary(m2)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfTrain2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.949 -9.929 0.614 10.028 55.992
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.948820 0.639361 118.789 <2e-16 ***
## PITCHING_HR 0.046356 0.005237 8.852 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.23 on 2269 degrees of freedom
## Multiple R-squared: 0.03338, Adjusted R-squared: 0.03296
## F-statistic: 78.37 on 1 and 2269 DF, p-value: < 2.2e-16
plot(m2)
library(car)
# `id.method` is not an argument of this version of influencePlot(); it was
# passed on to the graphics calls and only produced warnings. Point labelling
# is controlled by `id`.
influencePlot(m2, id = TRUE, main = "Influence Plot", sub = "Circle size is proportional to Cook’s distance")
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter
## StudRes Hat CookD
## 964 -1.039523 0.0058683344 0.003189286
## 982 -3.886600 0.0017628831 0.013255859
## 1810 2.114722 0.0049482791 0.011102505
## 1882 -1.303158 0.0058683344 0.005010737
## 2012 3.688318 0.0006272236 0.004245374
summary(m1$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -75.6569 -9.9562 0.6359 0.0000 10.0552 67.4774
describe(m1$residuals)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2276 0 15.47 0.64 0.2 14.84 -75.66 67.48 143.13 -0.18 0.86
## se
## X1 0.32
# Keep the residuals and fitted values of m1 alongside the training data.
dfTrain[["Residuals"]] <- residuals(m1)
dfTrain[["Fitted"]] <- fitted(m1)
library(tidyverse)
# Drop only the records that have both fewer than 50 wins and zero pitching
# home runs.
dfTrain_WithoutHR <- dplyr::filter(
  dfTrain,
  !(TARGET_WINS < 50 & PITCHING_HR == 0)
)
hist(dfTrain_WithoutHR$PITCHING_HR)
plot(dfTrain_WithoutHR$PITCHING_HR, dfTrain_WithoutHR$TARGET_WINS)
m3 <- lm(TARGET_WINS ~ PITCHING_HR, data=dfTrain_WithoutHR)
summary(m3)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfTrain_WithoutHR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.208 -9.802 0.653 9.952 66.914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.624136 0.636539 120.376 < 2e-16 ***
## PITCHING_HR 0.041723 0.005197 8.028 1.58e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.09 on 2263 degrees of freedom
## Multiple R-squared: 0.02769, Adjusted R-squared: 0.02726
## F-statistic: 64.45 on 1 and 2263 DF, p-value: 1.576e-15
plot(m3)
library(car)
# `id.method` is not an argument of this version of influencePlot(); it was
# passed on to the graphics calls and only produced warnings. Point labelling
# is controlled by `id`.
influencePlot(m3, id = TRUE, main = "Influence Plot", sub = "Circle size is proportional to Cook’s distance")
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter
## StudRes Hat CookD
## 299 4.4557422 0.0007060697 0.0069560265
## 829 0.2703629 0.0070966028 0.0002613277
## 856 -3.7394216 0.0014507753 0.0101000850
## 961 -0.9956483 0.0058665293 0.0029249611
## 1804 2.1826581 0.0049451007 0.0118181032
# Split the records at 50 pitching home runs with two complementary 0/1 flags.
dfTrain_BiModal <- dfTrain %>%
  mutate(
    HR_Low = if_else(PITCHING_HR < 50, 1, 0),
    HR_High = if_else(PITCHING_HR >= 50, 1, 0)
  )
# Several columns still contain NAs; without `use`, cor() returns NA for every
# pair that involves one of them. Pairwise deletion uses the rows available
# for each pair.
dfCor_BiModal <- as.data.frame(
  cor(dfTrain_BiModal, use = "pairwise.complete.obs")
)
# Does the low-HR flag add anything beyond PITCHING_HR itself?
m4 <- lm(TARGET_WINS ~ PITCHING_HR + HR_Low, data = dfTrain_BiModal)
summary(m4)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR + HR_Low, data = dfTrain_BiModal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.692 -9.976 0.653 10.058 67.556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.529253 1.069339 70.632 < 2e-16 ***
## PITCHING_HR 0.049398 0.007641 6.465 1.24e-10 ***
## HR_Low 0.162504 1.084033 0.150 0.881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2273 degrees of freedom
## Multiple R-squared: 0.03574, Adjusted R-squared: 0.03489
## F-statistic: 42.12 on 2 and 2273 DF, p-value: < 2.2e-16
# Diagnostic plots for the model with the low-HR indicator.
plot(m4)
# Split the data into the two groups defined by the indicator columns.
dfHighHR <- dplyr::filter(dfTrain_BiModal, HR_High == 1)
dfLowHR <- dplyr::filter(dfTrain_BiModal, HR_Low == 1)
# Two-sample t-test comparing wins between the low-HR and high-HR groups.
t.test(dfLowHR$TARGET_WINS, dfHighHR$TARGET_WINS)
##
## Welch Two Sample t-test
##
## data: dfLowHR$TARGET_WINS and dfHighHR$TARGET_WINS
## t = -5.4141, df = 753, p-value = 8.291e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.665804 -3.118167
## sample estimates:
## mean of x mean of y
## 77.11327 82.00526
# Wins on home runs allowed, restricted to the high-HR group.
m5 <- lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfHighHR)
summary(m5)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfHighHR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.641 -9.293 0.650 9.127 67.238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.107959 0.957161 79.514 < 2e-16 ***
## PITCHING_HR 0.044983 0.006848 6.569 6.72e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.73 on 1709 degrees of freedom
## Multiple R-squared: 0.02463, Adjusted R-squared: 0.02405
## F-statistic: 43.15 on 1 and 1709 DF, p-value: 6.72e-11
# Diagnostic plots for the high-HR-only model.
plot(m5)
# Correlation of every column except the first with PITCHING_HR ...
dfCor_HR <- as.data.frame(cor(dfTrain_BiModal[-1], dfTrain_BiModal$PITCHING_HR))
# ... and with the low-HR indicator.
dfCor_Low <- as.data.frame(cor(dfTrain_BiModal[-1], dfTrain_BiModal$HR_Low))
# Home runs hit vs. home runs allowed.
plot(dfTrain$BATTING_HR, dfTrain$PITCHING_HR)
# Per-row difference: home runs allowed minus home runs hit.
# Note this adds a new column to dfTrain itself.
dfTrain$HR_Diff <- dfTrain$PITCHING_HR -dfTrain$BATTING_HR
hist(dfTrain$HR_Diff, breaks=100)
# Summary statistics of the difference (psych::describe masks Hmisc::describe here).
describe(dfTrain$HR_Diff)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2276 6.09 15.1 2 2.93 2.97 -2 249 251 6.98 71.83 0.32
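The sum of home runs allowed exceeds the sum of home runs hit: the mean per-row difference is about 6 (median 2), with a long right tail reaching 249.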
# Regress home runs hit on home runs allowed.
# Columns are referenced through `data =` instead of `dfTrain$...` inside the
# formula, so the model terms carry plain column names and the fit can be
# used with predict(newdata = ...).
m6 <- lm(BATTING_HR ~ PITCHING_HR, data = dfTrain)
summary(m6)
##
## Call:
## lm(formula = dfTrain$BATTING_HR ~ dfTrain$PITCHING_HR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -234.609 0.123 1.336 6.992 12.817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.592392 0.621547 -2.562 0.0105 *
## dfTrain$PITCHING_HR 0.957481 0.005087 188.217 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.87 on 2274 degrees of freedom
## Multiple R-squared: 0.9397, Adjusted R-squared: 0.9397
## F-statistic: 3.543e+04 on 1 and 2274 DF, p-value: < 2.2e-16
# Diagnostic plots for the BATTING_HR ~ PITCHING_HR model.
plot(m6)
# Correlation test between walks by batters and walks allowed.
cor.test(dfTrain$BATTING_BB, dfTrain$PITCHING_BB)
##
## Pearson's product-moment correlation
##
## data: dfTrain$BATTING_BB and dfTrain$PITCHING_BB
## t = 26.759, df = 2274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4574724 0.5199930
## sample estimates:
## cor
## 0.4893613
# Walks by batters vs. walks allowed.
plot(dfTrain$BATTING_BB, dfTrain$PITCHING_BB)
dfTrain_ImputedMedian <- dfTrain_ImputedMean_NoCohort
# For each predictor: scatterplot with a loess smoother against TARGET_WINS,
# then a one-variable regression with its summary and diagnostic plots.
# The first column is skipped as before; TARGET_WINS is skipped as well so the
# response is never regressed on itself (which produced the "essentially
# perfect fit" warning).
predictor_names <- setdiff(colnames(dfTrain_ImputedMedian)[-1], "TARGET_WINS")
for (predictor in predictor_names) {
  # Columns are referenced by name through the .data pronoun, which avoids the
  # "Use of `df$col` is discouraged" warnings and labels the axis from the
  # data frame that is actually plotted.
  print(
    ggplot(dfTrain_ImputedMedian, aes(x = .data[[predictor]], y = TARGET_WINS)) +
      xlab(predictor) +
      stat_smooth(method = loess) +
      geom_point()
  )
  # Build the formula from the column name and splice it into the call so the
  # printed Call shows the actual predictor.
  f <- reformulate(predictor, response = "TARGET_WINS")
  m <- eval(bquote(lm(.(f), data = dfTrain_ImputedMedian)))
  par(mfcol = c(2, 2))
  print(summary(m))
  # plot() on an lm returns NULL invisibly; printing it only emitted "NULL".
  plot(m)
}
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning in summary.lm(m): essentially perfect fit: summary may be unreliable
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.647e-14 -1.120e-15 -7.000e-16 -2.800e-16 1.614e-12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.756e-13 3.928e-15 4.470e+01 <2e-16 ***
## dfTrain_ImputedMedian[, i] 1.000e+00 4.775e-17 2.094e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.467e-14 on 2172 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 4.385e+32 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.761 -8.515 0.971 9.783 43.230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.686332 3.164963 5.588 2.58e-08 ***
## dfTrain_ImputedMedian[, i] 0.042775 0.002136 20.025 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.31 on 2172 degrees of freedom
## Multiple R-squared: 0.1558, Adjusted R-squared: 0.1555
## F-statistic: 401 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.863 -9.376 0.670 10.121 57.415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.346919 1.737969 32.42 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.100118 0.007005 14.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.89 on 2172 degrees of freedom
## Multiple R-squared: 0.08597, Adjusted R-squared: 0.08555
## F-statistic: 204.3 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -76.628 -8.980 1.143 10.428 60.940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.62804 0.72265 106.038 < 2e-16 ***
## dfTrain_ImputedMedian[, i] 0.07596 0.01180 6.439 1.48e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.43 on 2172 degrees of freedom
## Multiple R-squared: 0.01873, Adjusted R-squared: 0.01828
## F-statistic: 41.46 on 1 and 2172 DF, p-value: 1.477e-10
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.596 -9.734 0.553 10.041 68.954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.595947 0.658670 114.771 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.050009 0.005527 9.048 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.29 on 2172 degrees of freedom
## Multiple R-squared: 0.03632, Adjusted R-squared: 0.03588
## F-statistic: 81.87 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.936 -9.554 0.579 9.674 78.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.935670 1.370076 48.13 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.029358 0.002635 11.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.15 on 2172 degrees of freedom
## Multiple R-squared: 0.05406, Adjusted R-squared: 0.05362
## F-statistic: 124.1 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.228 -9.308 0.963 10.609 63.772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.228036 1.043434 78.81 <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.001990 0.001344 -1.48 0.139
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared: 0.001008, Adjusted R-squared: 0.0005482
## F-statistic: 2.192 on 1 and 2172 DF, p-value: 0.1389
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.284 -9.080 1.024 10.198 65.160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.28444 0.57917 135.166 < 2e-16 ***
## dfTrain_ImputedMedian[, i] 0.02048 0.00392 5.226 1.9e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.48 on 2172 degrees of freedom
## Multiple R-squared: 0.01242, Adjusted R-squared: 0.01196
## F-statistic: 27.31 on 1 and 2172 DF, p-value: 1.899e-07
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.071 -9.493 1.233 10.483 65.236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.07067 0.98260 81.489 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.01314 0.01750 0.751 0.453
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared: 0.0002595, Adjusted R-squared: -0.0002008
## F-statistic: 0.5637 on 1 and 2172 DF, p-value: 0.4528
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.165 -9.462 0.897 10.651 68.914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.0150688 0.5308401 156.384 < 2e-16 ***
## dfTrain_ImputedMedian[, i] -0.0012543 0.0002309 -5.432 6.2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2172 degrees of freedom
## Multiple R-squared: 0.0134, Adjusted R-squared: 0.01295
## F-statistic: 29.5 on 1 and 2172 DF, p-value: 6.205e-08
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.906 -9.846 0.705 9.965 67.942
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.905514 0.682649 109.728 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.053432 0.005465 9.777 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.25 on 2172 degrees of freedom
## Multiple R-squared: 0.04216, Adjusted R-squared: 0.04171
## F-statistic: 95.59 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.528 -9.251 0.948 10.415 70.006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.528116 1.149967 64.809 < 2e-16 ***
## dfTrain_ImputedMedian[, i] 0.011187 0.001975 5.664 1.68e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.46 on 2172 degrees of freedom
## Multiple R-squared: 0.01455, Adjusted R-squared: 0.0141
## F-statistic: 32.08 on 1 and 2172 DF, p-value: 1.678e-08
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.570 -9.402 0.970 10.484 63.430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.5704787 0.5945630 138.876 < 2e-16 ***
## dfTrain_ImputedMedian[, i] -0.0022085 0.0006023 -3.667 0.000252 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.53 on 2172 degrees of freedom
## Multiple R-squared: 0.006152, Adjusted R-squared: 0.005695
## F-statistic: 13.45 on 1 and 2172 DF, p-value: 0.0002515
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.638 -9.847 0.708 10.050 73.590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.645750 0.476605 175.503 <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.011815 0.001415 -8.352 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared: 0.03112, Adjusted R-squared: 0.03067
## F-statistic: 69.75 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.809 -9.322 1.075 10.459 65.191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.70498 2.23001 37.536 <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.01979 0.01484 -1.334 0.182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared: 0.0008182, Adjusted R-squared: 0.0003582
## F-statistic: 1.779 on 1 and 2172 DF, p-value: 0.1825
## NULL
# Re-assign the working data from dfTrain_ImputedMean_NoCohort (same source as before the loop above).
dfTrain_ImputedMedian <- dfTrain_ImputedMean_NoCohort
# Build one scatterplot per numeric column of `df` against the column `y`.
#
# Args:
#   df:   data frame; non-numeric columns are dropped before plotting.
#   y:    name (string) of a numeric column of `df` to compare each column with.
#   flip: if TRUE, `y` is drawn on the x axis and the looped column on the
#         y axis; otherwise the looped column is on the x axis.
#
# Returns:
#   A list of ggplot objects, one per numeric column in column order (the
#   entry for `y` itself is included). Each plot is titled with the column
#   name and shows the cor.test() estimate and p-value, both rounded to two
#   decimals, as the x-axis label.
EHExplore_Correlations_Scatterplots <- function(df, y, flip=FALSE)
{
  df <- select_if(df, is.numeric)
  col_names <- colnames(df)
  if (!(y %in% col_names)) {
    stop("`y` must name a numeric column of `df`: ", y, call. = FALSE)
  }
  plot_list <- vector("list", length(col_names))
  for (i in seq_along(col_names)) {
    col_name <- col_names[i]
    # `[[` returns a plain vector for both data frames and tibbles.
    ct <- cor.test(df[[col_name]], df[[y]])
    xText <- str_c("Correlation: ", round(ct$estimate,2), " p value: ", round(ct$p.value,2))
    x_var <- if (flip) y else col_name
    y_var <- if (flip) col_name else y
    # `!!` injects the current column names into the mapping now; without it
    # the names would be looked up when the plot is drawn, after the loop has
    # moved on.
    plot_list[[i]] <- ggplot(df, aes(x = .data[[!!x_var]], y = .data[[!!y_var]])) +
      geom_point(fill="navy", color="white") +
      geom_smooth(method = "loess", color="red", fill="lightcoral") +
      ylab(y_var) +
      xlab(xText) +
      theme(
        title = element_text(size=9),
        axis.title.x = element_text(size = 8),
        axis.title.y = element_text(size = 9),
        axis.text.x = element_text(size = 8),
        axis.ticks.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_line(color="gray"),
        panel.background = element_rect(fill = "slategray2", color="darkslategray")
      ) +
      ggtitle(col_name)
  }
  return(plot_list)
}
# Correlation scatterplots of the imputed data against TARGET_WINS;
# lay out entries 2 through 11 on a 5 x 3 grid.
z4 <- EHExplore_Correlations_Scatterplots(df = dfTrain_ImputedMedian, y = "TARGET_WINS")
grid.arrange(grobs = z4[2:11], nrow = 5, ncol = 3)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#grid.arrange(grobs=z4[c(11:16)], ncol=3, nrow=6)
# Interleave three lists element by element.
#
# Args:
#   list1, list2, list3: lists (e.g. of plots). The length of `list1` decides
#     how many triples are produced.
#
# Returns:
#   A list of length 3 * length(list1) ordered as
#   list1[1], list2[1], list3[1], list1[2], list2[2], list3[2], ...
#   An empty `list1` yields an empty list.
EHExplore_IntegratePlotLists <-function(list1, list2, list3)
{
  n <- length(list1)
  # Allocate the result once instead of growing it inside the loop.
  zz2 <- vector("list", 3 * n)
  # seq_len() makes the loop a no-op for empty input; 1:length() would iterate
  # over c(1, 0) and fail on the first assignment.
  for (i in seq_len(n)) {
    zz2[i*3-2] <- list1[i]
    zz2[i*3-1] <- list2[i]
    zz2[i*3] <- list3[i]
  }
  return(zz2)
}
# Interleave the three plot lists (z, z6, z4) so that each row of the grid
# shows the three plots belonging to one variable. This reuses the helper
# defined above instead of repeating its loop inline.
zz1 <- EHExplore_IntegratePlotLists(z, z6, z4)
# First page: entries 1-24, three per row.
grid.arrange(grobs=zz1[c(1:24)], ncol=3, nrow=8)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Second page of the interleaved plot list: entries 25-48, three per row.
grid.arrange(grobs=zz1[c(25:48)], ncol=3, nrow=8)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#grid.arrange(grobs=zz1[c(25:36)], ncol=3, nrow=4)
#grid.arrange(grobs=zz1[c(37:48)], ncol=3, nrow=4)
Trying a quadratic transformation of team fielding errors: adding the squared term improves the fit to some degree (adjusted R-squared rises from about 0.031 to about 0.052).
# Add the square of FIELDING_E as column `sq`.
dfTrain_ImputedMedian2 <- mutate(dfTrain_ImputedMedian, sq = FIELDING_E^2)
# Reference fit: wins on FIELDING_E alone.
summary(lm(formula = TARGET_WINS ~ FIELDING_E, data = dfTrain_ImputedMedian2))
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E, data = dfTrain_ImputedMedian2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.638 -9.847 0.708 10.050 73.590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.645750 0.476605 175.503 <2e-16 ***
## FIELDING_E -0.011815 0.001415 -8.352 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared: 0.03112, Adjusted R-squared: 0.03067
## F-statistic: 69.75 on 1 and 2172 DF, p-value: < 2.2e-16
# Same fit with the squared FIELDING_E term added.
summary(lm(formula = TARGET_WINS ~ FIELDING_E + sq, data = dfTrain_ImputedMedian2))
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E + sq, data = dfTrain_ImputedMedian2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.981 -9.787 0.647 10.285 72.647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.985e+01 7.178e-01 111.246 < 2e-16 ***
## FIELDING_E 1.386e-02 3.924e-03 3.533 0.000419 ***
## sq -2.177e-05 3.108e-06 -7.005 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.17 on 2171 degrees of freedom
## Multiple R-squared: 0.05253, Adjusted R-squared: 0.05165
## F-statistic: 60.18 on 2 and 2171 DF, p-value: < 2.2e-16
# Two mods made - team pitching has the square term and interaction between hits and dp
# NOTE(review): the model fitted here contains neither of those terms; confirm
# whether this comment belongs to a later model.
par(mfcol=c(2,2))
# Full model on all remaining columns. INDEX is a row identifier (it is used
# to pick out individual rows elsewhere in this file), not a team statistic,
# so it is excluded from the predictors.
mod_2 <- lm(TARGET_WINS ~ . - INDEX, data = dfTrain_ImputedMedian)
summary(mod_2)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.264 -8.466 0.163 8.273 58.924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.9560970 5.4876280 4.365 1.33e-05 ***
## INDEX -0.0004771 0.0003788 -1.259 0.207988
## BATTING_H 0.0482928 0.0037112 13.013 < 2e-16 ***
## BATTING_2B -0.0232530 0.0092311 -2.519 0.011841 *
## BATTING_3B 0.0595670 0.0169134 3.522 0.000437 ***
## BATTING_HR 0.0655424 0.0272468 2.406 0.016234 *
## BATTING_BB 0.0084691 0.0057882 1.463 0.143567
## BATTING_SO -0.0100510 0.0025721 -3.908 9.61e-05 ***
## BASERUN_SB 0.0254437 0.0044746 5.686 1.47e-08 ***
## BASERUN_CS 0.0006521 0.0161429 0.040 0.967780
## PITCHING_H -0.0009865 0.0003651 -2.702 0.006949 **
## PITCHING_HR 0.0116273 0.0240289 0.484 0.628514
## PITCHING_BB 0.0014808 0.0040999 0.361 0.718000
## PITCHING_SO 0.0028141 0.0009069 3.103 0.001941 **
## FIELDING_E -0.0186779 0.0024906 -7.499 9.31e-14 ***
## FIELDING_DP -0.1091373 0.0136377 -8.003 1.97e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.86 on 2158 degrees of freedom
## Multiple R-squared: 0.3226, Adjusted R-squared: 0.3179
## F-statistic: 68.5 on 15 and 2158 DF, p-value: < 2.2e-16
# Diagnostic plots for the full model.
plot(mod_2)
# MASS is attached for stepAIC(). Attaching it masks dplyr::select (see the
# attach message below), so later code must call dplyr::select explicitly.
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
# AIC-based stepwise selection starting from the full model; progress output is suppressed.
step1 <- stepAIC(object = mod_2, trace = FALSE)
summary(step1)
##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B +
## BATTING_HR + BATTING_BB + BATTING_SO + BASERUN_SB + PITCHING_H +
## PITCHING_SO + FIELDING_E + FIELDING_DP, data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.153 -8.411 0.176 8.307 58.465
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.6861348 5.2806294 4.296 1.82e-05 ***
## BATTING_H 0.0486089 0.0036841 13.194 < 2e-16 ***
## BATTING_2B -0.0233877 0.0092203 -2.537 0.011265 *
## BATTING_3B 0.0602198 0.0166990 3.606 0.000318 ***
## BATTING_HR 0.0770786 0.0097715 7.888 4.83e-15 ***
## BATTING_BB 0.0104799 0.0033563 3.122 0.001817 **
## BATTING_SO -0.0104007 0.0024834 -4.188 2.93e-05 ***
## BASERUN_SB 0.0253857 0.0042813 5.929 3.53e-09 ***
## PITCHING_H -0.0008928 0.0003178 -2.809 0.005008 **
## PITCHING_SO 0.0030690 0.0006625 4.633 3.82e-06 ***
## FIELDING_E -0.0184139 0.0024107 -7.639 3.28e-14 ***
## FIELDING_DP -0.1095211 0.0136173 -8.043 1.43e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.86 on 2162 degrees of freedom
## Multiple R-squared: 0.3218, Adjusted R-squared: 0.3184
## F-statistic: 93.27 on 11 and 2162 DF, p-value: < 2.2e-16
Understanding the role of double plays - remove the influence of hits:
# Double plays vs. hits allowed, first on the imputed data ...
ggplot(data = dfTrain_ImputedMedian, mapping = aes(x = FIELDING_DP, y = PITCHING_H)) +
  geom_point()
# ... then on dfTrain, where rows with missing values are dropped by ggplot.
ggplot(data = dfTrain, mapping = aes(x = FIELDING_DP, y = PITCHING_H)) +
  geom_point()
## Warning: Removed 286 rows containing missing values (geom_point).
# Correlation between double plays and hits allowed in the imputed data.
with(dfTrain_ImputedMedian, cor(FIELDING_DP, PITCHING_H))
## [1] -0.08895731
# Additive model on dfTrain (rows with missing values are dropped by lm).
summary(lm(formula = TARGET_WINS ~ FIELDING_DP + PITCHING_H, data = dfTrain))
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_DP + PITCHING_H, data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.999 -9.102 0.739 10.013 43.146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.0829610 2.4592867 30.530 < 2e-16 ***
## FIELDING_DP -0.0045343 0.0121655 -0.373 0.709
## PITCHING_H 0.0041845 0.0008319 5.030 5.34e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.85 on 1987 degrees of freedom
## (286 observations deleted due to missingness)
## Multiple R-squared: 0.01377, Adjusted R-squared: 0.01278
## F-statistic: 13.87 on 2 and 1987 DF, p-value: 1.038e-06
# Same additive model on the imputed data.
summary(lm(formula = TARGET_WINS ~ FIELDING_DP + PITCHING_H, data = dfTrain_ImputedMedian))
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_DP + PITCHING_H, data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.237 -9.564 0.855 10.359 68.964
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.1139149 2.2975487 37.916 < 2e-16 ***
## FIELDING_DP -0.0271240 0.0147930 -1.834 0.0669 .
## PITCHING_H -0.0012921 0.0002317 -5.576 2.76e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.46 on 2171 degrees of freedom
## Multiple R-squared: 0.01493, Adjusted R-squared: 0.01402
## F-statistic: 16.45 on 2 and 2171 DF, p-value: 8.127e-08
# Main effects plus the FIELDING_DP:PITCHING_H interaction, on dfTrain.
summary(lm(formula = TARGET_WINS ~ FIELDING_DP * PITCHING_H, data = dfTrain))
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_DP * PITCHING_H, data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.126 -9.261 1.004 9.713 47.202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.023e+02 5.724e+00 17.872 < 2e-16 ***
## FIELDING_DP -2.549e-01 4.914e-02 -5.188 2.35e-07 ***
## PITCHING_H -1.244e-02 3.269e-03 -3.806 0.000145 ***
## FIELDING_DP:PITCHING_H 1.561e-04 2.970e-05 5.257 1.62e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.76 on 1986 degrees of freedom
## (286 observations deleted due to missingness)
## Multiple R-squared: 0.02731, Adjusted R-squared: 0.02584
## F-statistic: 18.59 on 3 and 1986 DF, p-value: 6.864e-12
# Same interaction model on the imputed data.
summary(lm(formula = TARGET_WINS ~ FIELDING_DP * PITCHING_H, data = dfTrain_ImputedMedian))
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_DP * PITCHING_H, data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.162 -9.515 0.820 10.312 69.257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.833e+01 5.757e+00 13.607 <2e-16 ***
## FIELDING_DP 3.302e-02 3.906e-02 0.845 0.3981
## PITCHING_H 3.513e-03 2.898e-03 1.212 0.2256
## FIELDING_DP:PITCHING_H -3.328e-05 2.001e-05 -1.663 0.0964 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.46 on 2170 degrees of freedom
## Multiple R-squared: 0.01618, Adjusted R-squared: 0.01482
## F-statistic: 11.9 on 3 and 2170 DF, p-value: 9.984e-08
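The interaction term makes a difference in the raw data, where it is highly significant; on the imputed data it is only marginally significant (p of about 0.10).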
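Adding a transformed Pitching_H term (the column is named logPitch_h, but the code below actually computes the square, PITCHING_H^2, not a log):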
# Histogram of hits allowed. The column is mapped by bare name: writing
# `dfTrain_ImputedMedian$PITCHING_H` inside aes() makes ggplot2 warn that
# `$` is discouraged there and bypasses the `data` argument.
ggplot(dfTrain_ImputedMedian, aes(x = PITCHING_H)) +
  geom_histogram(bins = 100)
## Warning: Use of `dfTrain_ImputedMedian$PITCHING_H` is discouraged. Use
## `PITCHING_H` instead.
# Add a transformed PITCHING_H column.
# NOTE(review): the column is called logPitch_h but holds PITCHING_H squared;
# confirm whether a log or a square was intended.
dfTrain_ImputedMedian5 <- mutate(dfTrain_ImputedMedian2, logPitch_h = PITCHING_H^2)
# Transformed term vs. wins, with a loess smoother.
ggplot(data = dfTrain_ImputedMedian5, mapping = aes(x = logPitch_h, y = TARGET_WINS)) +
  stat_smooth(method = loess) +
  geom_point()
## `geom_smooth()` using formula 'y ~ x'
# Wins on PITCHING_H plus the transformed term.
m <- lm(formula = TARGET_WINS ~ PITCHING_H + logPitch_h, data = dfTrain_ImputedMedian5)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H + logPitch_h, data = dfTrain_ImputedMedian5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.631 -9.694 1.045 10.242 64.174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.944e+01 9.013e-01 88.133 < 2e-16 ***
## PITCHING_H 1.126e-03 5.376e-04 2.094 0.0364 *
## logPitch_h -1.313e-07 2.682e-08 -4.897 1.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.39 on 2171 degrees of freedom
## Multiple R-squared: 0.02418, Adjusted R-squared: 0.02328
## F-statistic: 26.9 on 2 and 2171 DF, p-value: 2.895e-12
# Diagnostic plots for the PITCHING_H + logPitch_h model.
plot(m)
A closer look at Pitching_H, taking out the outliers.
# Two subsets by hits allowed: at most 1500, and above 2000.
# Rows with PITCHING_H between 1500 and 2000 fall in neither subset.
dfTrain_ImputedMedian6 <- dplyr::filter(dfTrain_ImputedMedian5, PITCHING_H <= 1500)
dfTrain_ImputedMedian7 <- dplyr::filter(dfTrain_ImputedMedian5, PITCHING_H > 2000)
# Low range: scatterplot with loess smoother, then a simple regression.
ggplot(data = dfTrain_ImputedMedian6, mapping = aes(x = PITCHING_H, y = TARGET_WINS)) +
  stat_smooth(method = loess) +
  geom_point()
## `geom_smooth()` using formula 'y ~ x'
m <- lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian6)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.864 -8.396 0.413 8.870 30.267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.26774 8.46728 0.976 0.329
## PITCHING_H 0.04990 0.00602 8.289 3.78e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.22 on 970 degrees of freedom
## Multiple R-squared: 0.06614, Adjusted R-squared: 0.06518
## F-statistic: 68.7 on 1 and 970 DF, p-value: 3.785e-16
# Diagnostic plots for the low-range (PITCHING_H <= 1500) fit.
plot(m)
# High range (PITCHING_H > 2000): scatterplot with loess smoother, then a simple regression.
ggplot(data = dfTrain_ImputedMedian7, mapping = aes(x = PITCHING_H, y = TARGET_WINS)) +
  stat_smooth(method = loess) +
  geom_point()
## `geom_smooth()` using formula 'y ~ x'
m <- lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian7)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.879 -13.887 2.392 15.885 65.947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.487384 2.180193 41.50 < 2e-16 ***
## PITCHING_H -0.002207 0.000418 -5.28 2.77e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.87 on 255 degrees of freedom
## Multiple R-squared: 0.09855, Adjusted R-squared: 0.09502
## F-statistic: 27.88 on 1 and 255 DF, p-value: 2.767e-07
# Diagnostic plots for the high-range (PITCHING_H > 2000) fit.
plot(m)
# Whole imputed data set for comparison: scatterplot with loess smoother, then a simple regression.
ggplot(data = dfTrain_ImputedMedian, mapping = aes(x = PITCHING_H, y = TARGET_WINS)) +
  stat_smooth(method = loess) +
  geom_point()
## `geom_smooth()` using formula 'y ~ x'
m <- lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.165 -9.462 0.897 10.651 68.914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.0150688 0.5308401 156.384 < 2e-16 ***
## PITCHING_H -0.0012543 0.0002309 -5.432 6.2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2172 degrees of freedom
## Multiple R-squared: 0.0134, Adjusted R-squared: 0.01295
## F-statistic: 29.5 on 1 and 2172 DF, p-value: 6.205e-08
# Diagnostic plots for the PITCHING_H fit on the whole imputed data set.
plot(m)
Eliminating outliers has no effect - but the outliers seem to be grouped (compare new outliers with old):
# Drop the three rows with INDEX 1211, 1342 and 1810, then refit wins on PITCHING_H.
dfTrain_ImputedMedian_nooutliers <- dplyr::filter(
  dfTrain_ImputedMedian,
  INDEX != 1211,
  INDEX != 1342,
  INDEX != 1810
)
m <- lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian_nooutliers)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian_nooutliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.170 -9.460 0.889 10.636 68.905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.0181857 0.5306250 156.45 < 2e-16 ***
## PITCHING_H -0.0012530 0.0002307 -5.43 6.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.46 on 2169 degrees of freedom
## Multiple R-squared: 0.01341, Adjusted R-squared: 0.01296
## F-statistic: 29.49 on 1 and 2169 DF, p-value: 6.263e-08
# Draw the lm diagnostic plots for the model refitted on
# dfTrain_ImputedMedian_nooutliers.
plot(m)
Looking for interactions:
# Add a 1/0 indicator for PITCHING_H <= 1500 and build the interaction
# scatterplots of TARGET_WINS split by that indicator.
dfTrain_ImputedMedian8 <- mutate(
  dfTrain_ImputedMedian,
  Pitch_h_Under1500 = ifelse(PITCHING_H <= 1500, 1, 0)
)
ghi <- EHExplore_Interactions_Scatterplots(
  dfTrain_ImputedMedian8,
  "TARGET_WINS",
  "Pitch_h_Under1500"
)
# First page: plots 1 through 8 in a 4-row by 2-column grid.
grid.arrange(grobs = ghi[1:8], nrow = 4, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Second page: plots 9 through 16 in the same 4-row by 2-column grid.
grid.arrange(grobs = ghi[9:16], nrow = 4, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Similar analysis, this time flagging the records that have missing data:
# Mark rows whose BATTING_SO is NA with Missing_Flag = 1 (0 otherwise), fit
# the all-predictor model and let stepAIC choose the retained terms.
# NOTE(review): lm() fits on complete cases only, so rows with a missing
# BATTING_SO may never reach the model - check the "observations deleted"
# line of the summary before interpreting Missing_Flag.
dfTrain_flag <- mutate(
  dfTrain2,
  Missing_Flag = ifelse(is.na(BATTING_SO), 1, 0)
)
mod_2 <- lm(formula = TARGET_WINS ~ ., data = dfTrain_flag)
step1 <- stepAIC(object = mod_2, trace = FALSE)
summary(step1)
##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_HBP + PITCHING_HR +
## PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP, data = dfTrain_flag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2248 -5.6294 -0.0212 5.0439 21.3065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.95454 19.10292 3.191 0.001670 **
## BATTING_H 0.02541 0.01009 2.518 0.012648 *
## BATTING_HBP 0.08712 0.04852 1.796 0.074211 .
## PITCHING_HR 0.08945 0.02394 3.736 0.000249 ***
## PITCHING_BB 0.05672 0.00940 6.034 8.66e-09 ***
## PITCHING_SO -0.03136 0.00728 -4.308 2.68e-05 ***
## FIELDING_E -0.17218 0.03970 -4.338 2.38e-05 ***
## FIELDING_DP -0.11904 0.03516 -3.386 0.000869 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.422 on 183 degrees of freedom
## (2080 observations deleted due to missingness)
## Multiple R-squared: 0.5345, Adjusted R-squared: 0.5167
## F-statistic: 30.02 on 7 and 183 DF, p-value: < 2.2e-16
The only interaction that appears is with fielding errors (FIELDING_E). However, interacting PITCHING_H with itself greatly improves the R-squared.
# Add squared, log and square-root versions of PITCHING_H in one step,
# then fit TARGET_WINS on the squared term alone.
dfTrain_ImputedMedian9 <- mutate(
  dfTrain_ImputedMedian8,
  Pitch_h_squared = PITCHING_H^2,
  Pitch_h_log = log(PITCHING_H),
  Pitch_h_sqrt = sqrt(PITCHING_H)
)
summary(
  lm(formula = TARGET_WINS ~ Pitch_h_squared, data = dfTrain_ImputedMedian9)
)
##
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_squared, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.015 -9.069 0.997 10.158 66.609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.119e+01 3.359e-01 241.736 < 2e-16 ***
## Pitch_h_squared -8.054e-08 1.147e-08 -7.024 2.88e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.4 on 2172 degrees of freedom
## Multiple R-squared: 0.02221, Adjusted R-squared: 0.02176
## F-statistic: 49.33 on 1 and 2172 DF, p-value: 2.883e-12
# Same single-predictor fit, using the log of PITCHING_H.
summary(
  lm(formula = TARGET_WINS ~ Pitch_h_log, data = dfTrain_ImputedMedian9)
)
##
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_log, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.408 -9.582 1.145 10.356 66.161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.2807 7.9389 10.994 <2e-16 ***
## Pitch_h_log -0.8795 1.0706 -0.822 0.411
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared: 0.0003106, Adjusted R-squared: -0.0001496
## F-statistic: 0.6749 on 1 and 2172 DF, p-value: 0.4114
# Same single-predictor fit, using the square root of PITCHING_H.
summary(
  lm(formula = TARGET_WINS ~ Pitch_h_sqrt, data = dfTrain_ImputedMedian9)
)
##
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_sqrt, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.753 -9.477 0.982 10.732 68.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.48013 1.47144 58.09 < 2e-16 ***
## Pitch_h_sqrt -0.11429 0.03474 -3.29 0.00102 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.54 on 2172 degrees of freedom
## Multiple R-squared: 0.00496, Adjusted R-squared: 0.004501
## F-statistic: 10.83 on 1 and 2172 DF, p-value: 0.001017
# PITCHING_H crossed with the <= 1500 indicator: main effects plus their
# interaction, so slope and intercept can differ on each side of 1500.
m <- lm(
  formula = TARGET_WINS ~ PITCHING_H*Pitch_h_Under1500,
  data = dfTrain_ImputedMedian8
)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H * Pitch_h_Under1500, data = dfTrain_ImputedMedian8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.864 -9.153 0.979 9.772 67.940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.643e+01 6.550e-01 131.965 < 2e-16 ***
## PITCHING_H -1.771e-03 2.322e-04 -7.628 3.55e-14 ***
## Pitch_h_Under1500 -7.816e+01 1.047e+01 -7.466 1.19e-13 ***
## PITCHING_H:Pitch_h_Under1500 5.167e-02 7.432e-03 6.952 4.76e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.08 on 2170 degrees of freedom
## Multiple R-squared: 0.06361, Adjusted R-squared: 0.06232
## F-statistic: 49.14 on 3 and 2170 DF, p-value: < 2.2e-16
# Diagnostic plots for the PITCHING_H x Pitch_h_Under1500 model held in `m`.
plot(m)
# FIELDING_E crossed with the same indicator.
summary(
  lm(
    formula = TARGET_WINS ~ FIELDING_E*Pitch_h_Under1500,
    data = dfTrain_ImputedMedian9
  )
)
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E * Pitch_h_Under1500, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.182 -9.571 0.598 9.826 73.499
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.867745 0.643380 136.572 < 2e-16 ***
## FIELDING_E -0.016158 0.001498 -10.787 < 2e-16 ***
## Pitch_h_Under1500 -0.776515 1.469068 -0.529 0.597
## FIELDING_E:Pitch_h_Under1500 -0.042078 0.008364 -5.031 5.28e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.88 on 2170 degrees of freedom
## Multiple R-squared: 0.08892, Adjusted R-squared: 0.08766
## F-statistic: 70.59 on 3 and 2170 DF, p-value: < 2.2e-16
# Baseline for comparison: FIELDING_E on its own, without the indicator.
summary(
  lm(formula = TARGET_WINS ~ FIELDING_E, data = dfTrain_ImputedMedian9)
)
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.638 -9.847 0.708 10.050 73.590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.645750 0.476605 175.503 <2e-16 ***
## FIELDING_E -0.011815 0.001415 -8.352 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared: 0.03112, Adjusted R-squared: 0.03067
## F-statistic: 69.75 on 1 and 2172 DF, p-value: < 2.2e-16
Final Mods:
# Final feature engineering and stepwise model comparison.
#
# Three models are selected with stepAIC:
#   step2 - columns of dfTrain_ImputedMean_NoCohort as they are
#   step3 - the same data plus the engineered features (dfTrain_Final)
#   step4 - dfTrain_ImputedMean plus engineered features and the
#           Missing_Flag interaction terms (dfTrain_Final2)

# Append the engineered predictors shared by both final data sets.
#
# @param df Data frame containing PITCHING_H, PITCHING_SO, FIELDING_DP,
#   FIELDING_E, BATTING_BB, BATTING_HR and BATTING_SO.
# @return `df` with ten extra columns, appended in the order listed below.
#   Every squared term except E_sq is stored with a negative sign.
add_final_features <- function(df) {
  df %>%
    mutate(
      # 1 when PITCHING_H is at or below 1500, 0 otherwise.
      Pitch_h_Under1500 = ifelse(PITCHING_H <= 1500, 1, 0),
      Prod_DP_H = FIELDING_DP * PITCHING_H,
      inter_H_Itself = PITCHING_H * Pitch_h_Under1500,
      Inter_H_Err = FIELDING_E * Pitch_h_Under1500,
      E_sq = FIELDING_E^2,
      BB_sq = -1 * BATTING_BB^2,
      BHR_sq = -1 * BATTING_HR^2,
      BSO_sq = -1 * BATTING_SO^2,
      PH_sq = -1 * PITCHING_H^2,
      PSO_sq = -1 * PITCHING_SO^2
    )
}

dfTrain_ImputedMedian8$Pitch_h_Under1500 <- as.numeric(dfTrain_ImputedMedian8$Pitch_h_Under1500)

dfTrain_Final <- add_final_features(dfTrain_ImputedMean_NoCohort)

# NOTE(review): if Missing_Flag is stored as a factor, as.numeric() returns
# the level codes (1/2) rather than 0/1 - confirm its type upstream.
dfTrain_ImputedMean$Missing_Flag <- as.numeric(dfTrain_ImputedMean$Missing_Flag)

# Same engineered features, plus each selected predictor multiplied by
# Missing_Flag so its slope can differ for the flagged cohort.
dfTrain_Final2 <- dfTrain_ImputedMean %>%
  add_final_features() %>%
  mutate(
    Inter_h_Cohort = PITCHING_H * Missing_Flag,
    Inter_bb_Cohort = PITCHING_BB * Missing_Flag,
    Inter_hr_Cohort = PITCHING_HR * Missing_Flag,
    Inter_E_Cohort = FIELDING_E * Missing_Flag,
    Inter_bh_Cohort = BATTING_H * Missing_Flag,
    Inter_bhr_Cohort = BATTING_HR * Missing_Flag,
    Inter_bbb_Cohort = BATTING_BB * Missing_Flag,
    Inter_bs_Cohort = BASERUN_SB * Missing_Flag
  )

# `mod_2` is reused as the starting model for each stepwise search.
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMean_NoCohort)
step2 <- stepAIC(mod_2, trace = FALSE)
#summary(step2)
par(mfcol = c(2, 2))
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_Final)
step3 <- stepAIC(mod_2, trace = FALSE)
#summary(step3)
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_Final2)
step4 <- stepAIC(mod_2, trace = FALSE)
summary(step4)
##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B +
## BATTING_BB + BATTING_SO + BASERUN_SB + PITCHING_H + PITCHING_BB +
## FIELDING_E + FIELDING_DP + Missing_Flag + Pitch_h_Under1500 +
## inter_H_Itself + Inter_H_Err + BB_sq + BHR_sq + BSO_sq +
## PH_sq + Inter_E_Cohort + Inter_bhr_Cohort + Inter_bbb_Cohort +
## Inter_bs_Cohort, data = dfTrain_Final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.224 -7.883 0.383 7.828 58.494
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.862e+01 7.211e+00 6.743 1.96e-11 ***
## BATTING_H 5.651e-02 3.809e-03 14.836 < 2e-16 ***
## BATTING_2B -1.332e-02 8.793e-03 -1.515 0.129953
## BATTING_3B 8.266e-02 1.590e-02 5.199 2.18e-07 ***
## BATTING_BB -2.197e-01 1.816e-02 -12.094 < 2e-16 ***
## BATTING_SO 4.104e-02 7.230e-03 5.676 1.55e-08 ***
## BASERUN_SB 3.979e-02 4.284e-03 9.288 < 2e-16 ***
## PITCHING_H -3.941e-03 1.044e-03 -3.776 0.000164 ***
## PITCHING_BB 1.909e-02 3.295e-03 5.793 7.87e-09 ***
## FIELDING_E -3.595e-02 2.968e-03 -12.115 < 2e-16 ***
## FIELDING_DP -8.629e-02 1.282e-02 -6.729 2.16e-11 ***
## Missing_Flag 2.730e+01 1.156e+01 2.362 0.018238 *
## Pitch_h_Under1500 3.344e+01 9.425e+00 3.548 0.000396 ***
## inter_H_Itself -1.719e-02 6.503e-03 -2.643 0.008276 **
## Inter_H_Err -3.658e-02 7.000e-03 -5.226 1.90e-07 ***
## BB_sq -2.041e-04 1.589e-05 -12.848 < 2e-16 ***
## BHR_sq -2.731e-04 3.515e-05 -7.767 1.21e-14 ***
## BSO_sq 3.267e-05 4.696e-06 6.957 4.55e-12 ***
## PH_sq -8.681e-08 3.439e-08 -2.524 0.011660 *
## Inter_E_Cohort -1.997e-01 2.698e-02 -7.401 1.89e-13 ***
## Inter_bhr_Cohort 3.348e-01 1.599e-01 2.094 0.036350 *
## Inter_bbb_Cohort 4.961e-02 1.895e-02 2.618 0.008908 **
## Inter_bs_Cohort 4.692e-02 2.717e-02 1.727 0.084256 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.22 on 2253 degrees of freedom
## Multiple R-squared: 0.4041, Adjusted R-squared: 0.3983
## F-statistic: 69.44 on 22 and 2253 DF, p-value: < 2.2e-16
# Adjusted R-squared of the three stepwise selections, for comparison:
# step2 was selected from dfTrain_ImputedMean_NoCohort, step3 from
# dfTrain_Final and step4 from dfTrain_Final2.
summary(step2)$adj.r.squared
## [1] 0.3183679
summary(step3)$adj.r.squared
## [1] 0.3776131
summary(step4)$adj.r.squared
## [1] 0.398272
Checking interactions with the missing-values cohort:
Looking for interactions:
# Add the negated square of BATTING_BB, then fit the linear-only model
# as the baseline for the quadratic comparison.
dfTrain_ImputedMean_NoCohort1 <- mutate(
  dfTrain_ImputedMean_NoCohort,
  BB_sq = -1*BATTING_BB^2
)
summary(
  lm(formula = TARGET_WINS ~ BATTING_BB, data = dfTrain_ImputedMean_NoCohort1)
)
##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_BB, data = dfTrain_ImputedMean_NoCohort1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.936 -9.554 0.579 9.674 78.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.935670 1.370076 48.13 <2e-16 ***
## BATTING_BB 0.029358 0.002635 11.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.15 on 2172 degrees of freedom
## Multiple R-squared: 0.05406, Adjusted R-squared: 0.05362
## F-statistic: 124.1 on 1 and 2172 DF, p-value: < 2.2e-16
# Linear plus (negated) squared BATTING_BB term.
summary(
  lm(
    formula = TARGET_WINS ~ BATTING_BB + BB_sq,
    data = dfTrain_ImputedMean_NoCohort1
  )
)
##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_BB + BB_sq, data = dfTrain_ImputedMean_NoCohort1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.421 -9.315 0.582 9.742 72.271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.442e+01 2.462e+00 30.229 < 2e-16 ***
## BATTING_BB -1.398e-02 1.079e-02 -1.296 0.195
## BB_sq -4.958e-05 1.197e-05 -4.142 3.58e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.09 on 2171 degrees of freedom
## Multiple R-squared: 0.06147, Adjusted R-squared: 0.06061
## F-statistic: 71.1 on 2 and 2171 DF, p-value: < 2.2e-16
PITCHING_SO has 20 zeros, which look like missing values. Also, eliminate the record with 0 wins.
# Pull out and display every row whose PITCHING_SO is exactly zero.
x <- filter(dfTrain_ImputedMean_NoCohort, PITCHING_SO == 0)
x
## INDEX TARGET_WINS BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB
## 1 325 120 2270 301 132 42 74
## 2 326 146 2305 322 111 29 64
## 3 435 65 1464 147 32 3 94
## 4 459 23 1458 220 35 0 93
## 5 952 77 1895 244 8 8 93
## 6 953 73 1685 206 31 0 58
## 7 1106 49 1794 281 58 6 79
## 8 1107 107 1725 194 67 4 79
## 9 1347 0 891 135 0 0 0
## 10 1498 24 1289 145 41 7 45
## 11 1502 105 1767 249 77 20 95
## 12 1503 71 1491 200 57 17 50
## 13 2037 97 1903 256 50 18 71
## 14 2038 118 2086 280 135 22 89
## 15 2048 81 1927 207 142 8 78
## 16 2049 88 1622 155 67 12 52
## 17 2253 34 1177 171 9 0 119
## 18 2254 93 1527 200 64 0 79
## 19 2486 12 1009 112 75 0 12
## 20 2493 29 1122 69 64 0 29
## BATTING_SO BASERUN_SB BASERUN_CS PITCHING_H PITCHING_HR PITCHING_BB
## 1 0 124.7618 52.80386 5253 97 171
## 2 0 124.7618 52.80386 4727 59 131
## 3 0 124.7618 52.80386 4312 9 277
## 4 0 124.7618 52.80386 16871 0 1076
## 5 0 124.7618 52.80386 5203 22 255
## 6 0 124.7618 52.80386 4074 0 140
## 7 0 124.7618 52.80386 5484 18 241
## 8 0 124.7618 52.80386 3408 8 156
## 9 0 0.0000 0.00000 24057 0 0
## 10 0 124.7618 52.80386 4443 24 155
## 11 0 124.7618 52.80386 4404 50 237
## 12 0 124.7618 52.80386 3552 41 119
## 13 0 124.7618 52.80386 5605 53 209
## 14 0 124.7618 52.80386 4629 49 198
## 15 0 124.7618 52.80386 5382 22 218
## 16 0 124.7618 52.80386 3864 29 124
## 17 0 124.7618 52.80386 10035 0 1015
## 18 0 124.7618 52.80386 3638 0 188
## 19 0 124.7618 52.80386 12574 0 150
## 20 0 124.7618 52.80386 6492 0 168
## PITCHING_SO FIELDING_E FIELDING_DP
## 1 0 1058 146.3879
## 2 0 951 146.3879
## 3 0 1473 146.3879
## 4 0 1898 146.3879
## 5 0 1225 146.3879
## 6 0 931 146.3879
## 7 0 1531 146.3879
## 8 0 853 146.3879
## 9 0 1890 146.3879
## 10 0 1506 146.3879
## 11 0 1092 146.3879
## 12 0 1253 146.3879
## 13 0 1166 146.3879
## 14 0 928 146.3879
## 15 0 1447 146.3879
## 16 0 1132 146.3879
## 17 0 1279 146.3879
## 18 0 1010 146.3879
## 19 0 847 146.3879
## 20 0 1522 146.3879
# Load the teengamb data from the faraway package and work on a copy whose
# `gamble` column is replaced by log(gamble + 1).
data(teengamb, package = "faraway")
dfTeenGamb <- teengamb
dfTeenGamb[["gamble"]] <- log(dfTeenGamb[["gamble"]] + 1)

# Build the boxplot, scatterplot and histogram lists, merge them with
# EHExplore_IntegratePlotLists, and show the first 15 plots in a 5 x 3 grid.
qq <- EHExplore_Outliers_Boxplots(dfTeenGamb)
qq1 <- EHExplore_Correlations_Scatterplots(dfTeenGamb, "gamble")
qq2 <- EHExplore_Distributions_Histograms(dfTeenGamb, nbins = 50)
abc <- EHExplore_IntegratePlotLists(qq, qq1, qq2)
grid.arrange(grobs = abc[1:15], nrow = 5, ncol = 3)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Interaction scatterplots of `gamble` split by `sex`; plots 2 through 5
# are shown in a grid declared as 3 rows by 2 columns.
def <- EHExplore_Interactions_Scatterplots(
  dfTeenGamb,
  "gamble",
  "sex"
)
grid.arrange(grobs = def[2:5], nrow = 3, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Main-effects model: log-transformed gamble on every other column.
summary(
  lm(formula = gamble ~ ., data = dfTeenGamb)
)
##
## Call:
## lm(formula = gamble ~ ., data = dfTeenGamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35012 -0.56865 0.00413 0.71512 1.90319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.71620 0.82240 2.087 0.0430 *
## sex -0.87120 0.39268 -2.219 0.0320 *
## status 0.02983 0.01344 2.219 0.0320 *
## income 0.21565 0.04904 4.398 7.33e-05 ***
## verbal -0.26165 0.10388 -2.519 0.0157 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.085 on 42 degrees of freedom
## Multiple R-squared: 0.5206, Adjusted R-squared: 0.475
## F-statistic: 11.4 on 4 and 42 DF, p-value: 2.347e-06
# Same predictors, each crossed with sex, to test whether any slope
# differs between the two sex groups.
summary(
  lm(
    formula = gamble ~ status*sex + income*sex + verbal*sex,
    data = dfTeenGamb
  )
)
##
## Call:
## lm(formula = gamble ~ status * sex + income * sex + verbal *
## sex, data = dfTeenGamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.36611 -0.52121 -0.05909 0.77500 1.90245
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.833907 0.924205 1.984 0.0543 .
## status 0.027403 0.017390 1.576 0.1232
## sex -1.248394 1.838417 -0.679 0.5011
## income 0.239965 0.055271 4.342 9.73e-05 ***
## verbal -0.278137 0.127264 -2.186 0.0349 *
## status:sex 0.003361 0.028806 0.117 0.9077
## sex:income -0.119830 0.127150 -0.942 0.3518
## sex:verbal 0.113498 0.241114 0.471 0.6405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.1 on 39 degrees of freedom
## Multiple R-squared: 0.5424, Adjusted R-squared: 0.4603
## F-statistic: 6.604 on 7 and 39 DF, p-value: 3.603e-05
# Connect to the HC_A1C SQL Server database and load tblA1C into dfA1C.
#
# SECURITY: the user id and password are read from the environment
# variables A1C_DB_UID and A1C_DB_PWD (for example set in ~/.Renviron)
# instead of being written into the script. Credentials that were ever
# committed in plain text should be rotated.
library(RODBC)

a1c_uid <- Sys.getenv("A1C_DB_UID")
a1c_pwd <- Sys.getenv("A1C_DB_PWD")
if (!nzchar(a1c_uid) || !nzchar(a1c_pwd)) {
  stop(
    "Set the A1C_DB_UID and A1C_DB_PWD environment variables before ",
    "connecting to HC_A1C.",
    call. = FALSE
  )
}

# This is the connection string:
strConnection <- paste0(
  "Driver={ODBC Driver 13 for SQL Server};",
  "Server=tcp:ehtmp.database.windows.net,1433;",
  "Database=HC_A1C;",
  "Encrypt=yes;",
  "TrustServerCertificate=no;",
  "Connection Timeout=30;",
  "Uid=", a1c_uid, ";",
  "Pwd=", a1c_pwd, ";"
)

# odbcDriverConnect() returns -1 (not an "RODBC" object) when it fails.
dbConnection <- odbcDriverConnect(strConnection)
if (!inherits(dbConnection, "RODBC")) {
  stop("Could not open the ODBC connection to HC_A1C.", call. = FALSE)
}

# sqlQuery() returns a character vector of messages instead of a data
# frame when the query fails.
dfA1C <- sqlQuery(dbConnection, "SELECT * FROM tblA1C")
if (!is.data.frame(dfA1C)) {
  stop(paste(dfA1C, collapse = "\n"), call. = FALSE)
}
# NOTE(review): the connection is left open, as before; call
# odbcClose(dbConnection) once no later chunk needs it.
# Derive the outcome columns from the diagnosis and most recent A1C values:
#   A1CDropPerCent - drop as a fraction of the diagnosis value
#   A1CDrop        - absolute drop (positive when the latest A1C is lower)
#   Improved       - 1 when A1CDrop > 0, 0 when A1CDrop <= 0
# Only rows with A1CDrop below 7.4 are kept (presumably to trim extreme
# values - confirm the cutoff).
#
# Explicit assignment with `%>%` is used for every step: the compound
# assignment pipe `%<>%` is only meaningful as the first operator of a
# chain, never in the middle of one.
dfA1C <- dfA1C %>%
  mutate(A1CDropPerCent = -1 * (MostrecentA1C - DiagA1C) / DiagA1C) %>%
  mutate(A1CDrop = -1 * (MostrecentA1C - DiagA1C)) %>%
  mutate(
    Improved = case_when(
      A1CDrop > 0 ~ 1,
      A1CDrop <= 0 ~ 0
    )
  ) %>%
  filter(A1CDrop < 7.4)
skim(dfA1C)
| Name | dfA1C |
| Number of rows | 284 |
| Number of columns | 33 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 31 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| DiagDaigDateA1C | 0 | 1 | 9 | 9 | 0 | 214 | 0 |
| MostRecentDate | 0 | 1 | 9 | 9 | 0 | 222 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| rownames | 0 | 1 | 142.88 | 82.65 | 1.00 | 71.75 | 142.50 | 214.25 | 286.00 | ▇▇▇▇▇ |
| ïRandom | 0 | 1 | 253.75 | 85.55 | 100.16 | 177.31 | 258.58 | 323.01 | 397.03 | ▆▆▇▇▆ |
| ClientID | 0 | 1 | 20621.68 | 12223.58 | 6312.00 | 10263.50 | 18425.00 | 25351.25 | 52278.00 | ▇▅▂▂▂ |
| Has_Coaching | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| BlackOrNot | 0 | 1 | 0.49 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| Immigrant | 0 | 1 | 0.20 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| SpanishSpeaking | 0 | 1 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| Age | 0 | 1 | 58.23 | 11.45 | 25.00 | 53.00 | 59.00 | 65.00 | 90.00 | ▁▂▇▃▁ |
| MaleOrNot | 0 | 1 | 0.40 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| Hspanic1 | 0 | 1 | 0.30 | 0.46 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| US_Born | 0 | 1 | 0.40 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| NumOfEduOrCoach | 0 | 1 | 0.69 | 1.44 | 0.00 | 0.00 | 0.00 | 1.00 | 11.00 | ▇▁▁▁▁ |
| NumA1CScreens | 0 | 1 | 3.74 | 2.34 | 2.00 | 2.00 | 3.00 | 5.00 | 19.00 | ▇▁▁▁▁ |
| Linked | 0 | 1 | 0.95 | 0.21 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| X6Monthlink | 0 | 1 | 0.74 | 0.44 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| SICares | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| SDOH_Poverty | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| SDOH_AnyButHunger | 0 | 1 | 0.11 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| SDOH_Education | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| HasNutritionCoaching | 0 | 1 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| MedaAdherence | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| Qorkingwithhealthcareprovider | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| NumberofQTACS | 0 | 1 | 1.94 | 3.24 | 0.00 | 0.00 | 0.00 | 5.00 | 15.00 | ▇▂▁▁▁ |
| Newly_Diagnosed | 0 | 1 | 0.37 | 0.48 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| DiagA1C | 0 | 1 | 8.53 | 1.98 | 5.40 | 7.00 | 7.90 | 9.50 | 14.70 | ▇▇▃▂▁ |
| DaysSinceMostRecentA1C | 0 | 1 | 985.35 | 526.60 | 272.00 | 506.00 | 784.00 | 1406.75 | 2137.00 | ▇▅▂▃▃ |
| MostrecentA1C | 0 | 1 | 7.89 | 2.26 | 4.00 | 6.18 | 7.20 | 9.03 | 15.00 | ▆▇▅▂▁ |
| Lag | 0 | 1 | 21.98 | 16.79 | 3.00 | 8.00 | 16.00 | 32.00 | 65.00 | ▇▃▂▂▂ |
| A1CDropPerCent | 0 | 1 | 0.07 | 0.19 | -0.67 | -0.03 | 0.09 | 0.20 | 0.55 | ▁▂▆▇▂ |
| A1CDrop | 0 | 1 | 0.65 | 1.72 | -6.00 | -0.20 | 0.60 | 1.70 | 6.40 | ▁▂▇▅▁ |
| Improved | 0 | 1 | 0.67 | 0.47 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
# Build boxplots, histograms and A1CDrop scatterplots for dfA1C, merge
# the three lists into `a4`, and show plots 1-12 in a 4 x 3 grid.
a1 <- EHExplore_Outliers_Boxplots(dfA1C)
a2 <- EHExplore_Distributions_Histograms(dfA1C)
a3 <- EHExplore_Correlations_Scatterplots(dfA1C, "A1CDrop")
a4 <- EHExplore_IntegratePlotLists(a1, a2, a3)
grid.arrange(grobs = a4[1:12], ncol = 3, nrow = 4)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
# Plots 13-24 of the merged list, same 4 x 3 layout.
grid.arrange(grobs = a4[13:24], ncol = 3, nrow = 4)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 9.2455e-031
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 9.2455e-031
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## `geom_smooth()` using formula 'y ~ x'
# Plots 25-36 of the merged list, same 4 x 3 layout.
grid.arrange(grobs = a4[25:36], ncol = 3, nrow = 4)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 4.9695e-030
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 4.9695e-030
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 4.9695e-030
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 4.9695e-030
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.055
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.055
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1
# Draw entries 37-48 of the a4 plot list on a 4-row by 3-column grid.
grid.arrange(grobs = a4[37:48], ncol = 3, nrow = 4)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 3.1761e-031
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 3.1761e-031
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
# Draw entries 49-60 of the a4 plot list on a 4-row by 3-column grid.
grid.arrange(grobs = a4[49:60], ncol = 3, nrow = 4)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
# Draw entries 61-72 of the a4 plot list on a 4-row by 3-column grid.
grid.arrange(grobs = a4[61:72], ncol = 3, nrow = 4)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 4.526e-030
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 4.526e-030
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
# Draw entries 73-84 of the a4 plot list on a 4-row by 3-column grid.
grid.arrange(grobs = a4[73:84], ncol = 3, nrow = 4)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Draw entries 85-93 of the a4 plot list. Only nine slots are selected, but the
# grid is still requested as 4 rows by 3 columns, matching the earlier pages.
grid.arrange(grobs = a4[85:93], ncol = 3, nrow = 4)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.7472e-031
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 5.7472e-031
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
# Load the `happy` data set from the faraway package, keep a copy under the
# dfHappy name used by the rest of this section, and print column summaries.
data("happy", package = "faraway")
dfHappy <- happy
summary(happy)
## happy money sex love
## Min. : 2.000 Min. : 0.00 Min. :0.0000 Min. :1.000
## 1st Qu.: 5.000 1st Qu.: 42.50 1st Qu.:0.0000 1st Qu.:2.000
## Median : 7.000 Median : 50.00 Median :1.0000 Median :3.000
## Mean : 6.744 Mean : 62.15 Mean :0.6923 Mean :2.462
## 3rd Qu.: 8.000 3rd Qu.: 78.00 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :10.000 Max. :175.00 Max. :1.0000 Max. :3.000
## work
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.359
## 3rd Qu.:4.000
## Max. :5.000
# Run the three EHExplore_* plotting helpers on dfHappy (the scatterplot helper
# is given "happy" as its y), merge their results into one list with
# EHExplore_IntegratePlotLists, then draw the first 15 entries of the merged
# list on a 5-row by 3-column grid.
w  <- EHExplore_Outliers_Boxplots(dfHappy)
w2 <- EHExplore_Distributions_Histograms(dfHappy)
w3 <- EHExplore_Correlations_Scatterplots(dfHappy, y = "happy")
w4 <- EHExplore_IntegratePlotLists(w, w2, w3)
grid.arrange(grobs = w4[seq_len(15)], nrow = 5, ncol = 3)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.005
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.01
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.99
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 2.0571e-016
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.0201
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 0.99
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 2.01
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 2.0571e-016
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1.0201
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 5.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 7.3994e-017
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 5.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 2.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 7.3994e-017
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1
# Regress happy on all remaining columns of dfHappy, then pass the fitted model
# to EHModel_PrintSummary (a helper defined outside this chunk).
m1 <- lm(formula = happy ~ ., data = dfHappy)
EHModel_PrintSummary(m1)
##
## Call:
## lm(formula = happy ~ ., data = dfHappy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7186 -0.5779 -0.1172 0.6340 2.0651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.072081 0.852543 -0.085 0.9331
## money 0.009578 0.005213 1.837 0.0749 .
## sex -0.149008 0.418525 -0.356 0.7240
## love 1.919279 0.295451 6.496 1.97e-07 ***
## work 0.476079 0.199389 2.388 0.0227 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.058 on 34 degrees of freedom
## Multiple R-squared: 0.7102, Adjusted R-squared: 0.6761
## F-statistic: 20.83 on 4 and 34 DF, p-value: 9.364e-09
## NULL
# Combine the plots returned by EHExplore_Interactions_Scatterplots for
# dfHappy (called with "happy" and "sex") into a single figure.
wrap_plots(
  EHExplore_Interactions_Scatterplots(dfHappy, "happy", "sex")
)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Refit the happiness model with a work-by-sex interaction (work * sex expands
# to both main effects plus work:sex) and print the summary.
summary(
  lm(happy ~ work * sex + love + money, data = dfHappy)
)
##
## Call:
## lm(formula = happy ~ work * sex + love + money, data = dfHappy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.15748 -0.82393 -0.04975 0.66071 2.03120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.198650 1.365571 -0.878 0.3864
## work 0.794734 0.361747 2.197 0.0352 *
## sex 1.411580 1.537161 0.918 0.3651
## love 1.883332 0.296924 6.343 3.53e-07 ***
## money 0.009403 0.005207 1.806 0.0801 .
## work:sex -0.424339 0.402232 -1.055 0.2991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.057 on 33 degrees of freedom
## Multiple R-squared: 0.7197, Adjusted R-squared: 0.6772
## F-statistic: 16.94 on 5 and 33 DF, p-value: 2.732e-08
# Load the `divusa` data set from the faraway package, keep a copy under the
# dfDivusa name used below, and print column summaries.
data("divusa", package = "faraway")
dfDivusa <- divusa
summary(dfDivusa)
## year divorce unemployed femlab
## Min. :1920 Min. : 6.10 Min. : 1.200 Min. :22.70
## 1st Qu.:1939 1st Qu.: 8.70 1st Qu.: 4.200 1st Qu.:27.47
## Median :1958 Median :10.60 Median : 5.600 Median :37.10
## Mean :1958 Mean :13.27 Mean : 7.173 Mean :38.58
## 3rd Qu.:1977 3rd Qu.:20.30 3rd Qu.: 7.500 3rd Qu.:47.80
## Max. :1996 Max. :22.80 Max. :24.900 Max. :59.30
## marriage birth military
## Min. : 49.70 Min. : 65.30 Min. : 1.940
## 1st Qu.: 61.90 1st Qu.: 68.90 1st Qu.: 3.469
## Median : 74.10 Median : 85.90 Median : 9.102
## Mean : 72.97 Mean : 88.89 Mean :12.365
## 3rd Qu.: 80.00 3rd Qu.:107.30 3rd Qu.:14.266
## Max. :118.10 Max. :122.90 Max. :86.641
# Scatterplots of dfDivusa built around "year", with the helper's flip option
# switched on, combined into a single figure.
wrap_plots(
  EHExplore_Correlations_Scatterplots(dfDivusa, "year", flip = TRUE)
)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Scatterplots of dfDivusa built around "divorce", combined into a single
# figure.
wrap_plots(
  EHExplore_Correlations_Scatterplots(dfDivusa, "divorce")
)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Regress divorce on every other column of dfDivusa and print the fit through
# EHModel_PrintSummary.
EHModel_PrintSummary(
  lm(divorce ~ ., data = dfDivusa)
)
##
## Call:
## lm(formula = divorce ~ ., data = dfDivusa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9087 -0.9212 -0.0935 0.7447 3.4689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 380.14761 99.20371 3.832 0.000274 ***
## year -0.20312 0.05333 -3.809 0.000297 ***
## unemployed -0.04933 0.05378 -0.917 0.362171
## femlab 0.80793 0.11487 7.033 1.09e-09 ***
## marriage 0.14977 0.02382 6.287 2.42e-08 ***
## birth -0.11695 0.01470 -7.957 2.19e-11 ***
## military -0.04276 0.01372 -3.117 0.002652 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.513 on 70 degrees of freedom
## Multiple R-squared: 0.9344, Adjusted R-squared: 0.9288
## F-statistic: 166.2 on 6 and 70 DF, p-value: < 2.2e-16
## NULL
# Read the housing training data; string columns are converted to factors.
# NOTE(review): the path is absolute and machine-specific, so this line only
# runs where that exact file exists. Consider routing it through a
# configurable location before sharing the script.
dfHomeSales <- read.csv("D:\\RStudio\\605_Final\\Final\\housing\\train.csv", stringsAsFactors=TRUE, header = TRUE)

# Run the three EHExplore_* plotting helpers on dfHomeSales (the scatterplot
# helper is called with "SalePrice") and merge the resulting lists.
zs <- EHExplore_Outliers_Boxplots(dfHomeSales)
zs1 <- EHExplore_Distributions_Histograms(dfHomeSales)
zs2 <- EHExplore_Correlations_Scatterplots(dfHomeSales, "SalePrice")
zs4 <- EHExplore_IntegratePlotLists(zs, zs1, zs2)

# Draw the merged list twelve entries per page on a 3-column by 4-row grid;
# the final page holds entries 109-114 on a 3 x 2 grid.
# The row count has to be spelled `nrow`. The earlier `now=` spelling matched
# no argument of arrangeGrob() and was absorbed by `...`, so the requested
# number of rows was never actually passed.
grid.arrange(grobs = zs4[1:12], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[13:24], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[25:36], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[37:48], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[49:60], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[61:72], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[73:84], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[85:96], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[97:108], ncol = 3, nrow = 4)
grid.arrange(grobs = zs4[109:114], ncol = 3, nrow = 2)
# Interaction scatterplots for dfHomeSales, called with "SalePrice" and
# "OverallQual". The call is left at top level so its return value is
# auto-printed.
EHExplore_Interactions_Scatterplots(
  dfHomeSales,
  "SalePrice",
  "OverallQual"
)
## [[1]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[3]]
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 259 rows containing non-finite values (stat_smooth).
## Warning: Removed 259 rows containing missing values (geom_point).
##
## [[4]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[5]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[6]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[7]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[8]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[9]]
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
##
## [[10]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[11]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[12]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[13]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[14]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[15]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[16]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[17]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[18]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[19]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[20]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[21]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[22]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[23]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[24]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[25]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[26]]
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 81 rows containing non-finite values (stat_smooth).
## Warning: Removed 81 rows containing missing values (geom_point).
##
## [[27]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[28]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[29]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[30]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[31]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[32]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[33]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[34]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[35]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[36]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[37]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[38]]
## `geom_smooth()` using formula 'y ~ x'
# Interaction scatterplots for dfHomeSales, called with "SalePrice" and
# "OverallCond". The call is left at top level so its return value is
# auto-printed.
EHExplore_Interactions_Scatterplots(
  dfHomeSales,
  "SalePrice",
  "OverallCond"
)
## [[1]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[2]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[3]]
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 259 rows containing non-finite values (stat_smooth).
## Warning: Removed 259 rows containing missing values (geom_point).
##
## [[4]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[5]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[6]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[7]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[8]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[9]]
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
##
## [[10]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[11]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[12]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[13]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[14]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[15]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[16]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[17]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[18]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[19]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[20]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[21]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[22]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[23]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[24]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[25]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[26]]
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 81 rows containing non-finite values (stat_smooth).
## Warning: Removed 81 rows containing missing values (geom_point).
##
## [[27]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[28]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[29]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[30]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[31]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[32]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[33]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[34]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[35]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[36]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[37]]
## `geom_smooth()` using formula 'y ~ x'
##
## [[38]]
## `geom_smooth()` using formula 'y ~ x'
# Additive model: SalePrice on OverallQual and GrLivArea, no interaction term.
summary(
  lm(SalePrice ~ OverallQual + GrLivArea, data = dfHomeSales)
)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea, data = dfHomeSales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -379572 -22266 -386 19895 289501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -104092.67 5045.37 -20.63 <2e-16 ***
## OverallQual 32849.05 999.20 32.88 <2e-16 ***
## GrLivArea 55.86 2.63 21.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42500 on 1457 degrees of freedom
## Multiple R-squared: 0.7142, Adjusted R-squared: 0.7138
## F-statistic: 1820 on 2 and 1457 DF, p-value: < 2.2e-16
# Same two predictors with their interaction: OverallQual * GrLivArea expands
# to both main effects plus OverallQual:GrLivArea.
summary(
  lm(SalePrice ~ OverallQual * GrLivArea, data = dfHomeSales)
)
##
## Call:
## lm(formula = SalePrice ~ OverallQual * GrLivArea, data = dfHomeSales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -527522 -20527 30 17832 271107
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10031.529 12099.368 0.829 0.40719
## OverallQual 15409.364 1948.165 7.910 5.06e-15 ***
## GrLivArea -23.428 8.103 -2.891 0.00389 **
## OverallQual:GrLivArea 11.620 1.128 10.305 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41050 on 1456 degrees of freedom
## Multiple R-squared: 0.7336, Adjusted R-squared: 0.7331
## F-statistic: 1337 on 3 and 1456 DF, p-value: < 2.2e-16